-rw-r--r--  .clang-format  1
-rw-r--r--  .mailmap  3
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-cxl  11
-rw-r--r--  Documentation/ABI/testing/sysfs-class-power-rt9756  30
-rw-r--r--  Documentation/PCI/pci-error-recovery.rst  15
-rw-r--r--  Documentation/devicetree/bindings/iommu/arm,smmu.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/iommu/qcom,iommu.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/pci/amlogic,axg-pcie.yaml  23
-rw-r--r--  Documentation/devicetree/bindings/pci/cix,sky1-pcie-host.yaml  83
-rw-r--r--  Documentation/devicetree/bindings/pci/mediatek-pcie-mt7623.yaml  164
-rw-r--r--  Documentation/devicetree/bindings/pci/mediatek-pcie.txt  289
-rw-r--r--  Documentation/devicetree/bindings/pci/mediatek-pcie.yaml  438
-rw-r--r--  Documentation/devicetree/bindings/pci/nxp,s32g-pcie.yaml  130
-rw-r--r--  Documentation/devicetree/bindings/pci/pci-ep.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-common.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sa8255p.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sa8775p.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sc7280.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sc8180x.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sc8280xp.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sm8150.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sm8250.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sm8350.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sm8450.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-sm8550.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-x1e80100.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/renesas,r9a08g045-pcie.yaml  249
-rw-r--r--  Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/pci/spacemit,k1-pcie-host.yaml  157
-rw-r--r--  Documentation/devicetree/bindings/pci/toshiba,tc9563.yaml  179
-rw-r--r--  Documentation/devicetree/bindings/power/supply/richtek,rt9756.yaml  72
-rw-r--r--  Documentation/driver-api/cxl/allocation/page-allocator.rst  31
-rw-r--r--  Documentation/driver-api/firmware/efi/index.rst  11
-rw-r--r--  Documentation/driver-api/generic_pt.rst  137
-rw-r--r--  Documentation/driver-api/index.rst  1
-rw-r--r--  Documentation/driver-api/pci/p2pdma.rst  97
-rw-r--r--  Documentation/driver-api/pci/pci.rst  3
-rw-r--r--  Documentation/power/power_supply_class.rst  84
-rw-r--r--  MAINTAINERS  44
-rw-r--r--  arch/powerpc/include/asm/mem_encrypt.h  3
-rw-r--r--  arch/powerpc/kernel/iommu.c  5
-rw-r--r--  block/blk-mq-dma.c  2
-rw-r--r--  drivers/acpi/apei/ghes.c  27
-rw-r--r--  drivers/acpi/numa/hmat.c  11
-rw-r--r--  drivers/amba/Kconfig  2
-rw-r--r--  drivers/crypto/hisilicon/qm.c  27
-rw-r--r--  drivers/crypto/intel/qat/qat_common/adf_aer.c  2
-rw-r--r--  drivers/cxl/acpi.c  73
-rw-r--r--  drivers/cxl/core/cdat.c  4
-rw-r--r--  drivers/cxl/core/hdm.c  3
-rw-r--r--  drivers/cxl/core/pci.c  87
-rw-r--r--  drivers/cxl/core/port.c  1
-rw-r--r--  drivers/cxl/core/region.c  311
-rw-r--r--  drivers/cxl/cxl.h  29
-rw-r--r--  drivers/cxl/cxlpci.h  1
-rw-r--r--  drivers/cxl/pci.c  2
-rw-r--r--  drivers/dma-buf/Makefile  2
-rw-r--r--  drivers/dma-buf/dma-buf-mapping.c  248
-rw-r--r--  drivers/dma/ioat/init.c  1
-rw-r--r--  drivers/firmware/efi/cper-arm.c  52
-rw-r--r--  drivers/firmware/efi/cper.c  62
-rw-r--r--  drivers/firmware/efi/libstub/efi-stub.c  2
-rw-r--r--  drivers/firmware/efi/libstub/efistub.h  31
-rw-r--r--  drivers/firmware/efi/libstub/gop.c  137
-rw-r--r--  drivers/firmware/efi/libstub/x86-stub.c  104
-rw-r--r--  drivers/firmware/efi/memattr.c  7
-rw-r--r--  drivers/firmware/efi/riscv-runtime.c  10
-rw-r--r--  drivers/firmware/efi/stmm/mm_communication.h  6
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  20
-rw-r--r--  drivers/gpu/drm/i915/gt/intel_region_lmem.c  24
-rw-r--r--  drivers/gpu/drm/i915/gvt/kvmgt.c  263
-rw-r--r--  drivers/gpu/drm/xe/xe_vram.c  58
-rw-r--r--  drivers/infiniband/Kconfig  1
-rw-r--r--  drivers/infiniband/core/cm.c  9
-rw-r--r--  drivers/infiniband/core/cma.c  2
-rw-r--r--  drivers/infiniband/core/device.c  4
-rw-r--r--  drivers/infiniband/core/restrack.c  4
-rw-r--r--  drivers/infiniband/core/ucma.c  2
-rw-r--r--  drivers/infiniband/core/umem.c  8
-rw-r--r--  drivers/infiniband/core/verbs.c  3
-rw-r--r--  drivers/infiniband/hw/Makefile  1
-rw-r--r--  drivers/infiniband/hw/bng_re/Kconfig  10
-rw-r--r--  drivers/infiniband/hw/bng_re/Makefile  8
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_debugfs.c  39
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_debugfs.h  12
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_dev.c  534
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_fw.c  767
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_fw.h  211
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_re.h  85
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_res.c  279
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_res.h  215
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_sp.c  131
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_sp.h  47
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_tlv.h  128
-rw-r--r--  drivers/infiniband/hw/bnxt_re/bnxt_re.h  2
-rw-r--r--  drivers/infiniband/hw/bnxt_re/debugfs.c  128
-rw-r--r--  drivers/infiniband/hw/bnxt_re/debugfs.h  19
-rw-r--r--  drivers/infiniband/hw/bnxt_re/ib_verbs.c  8
-rw-r--r--  drivers/infiniband/hw/bnxt_re/main.c  1
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_fp.c  3
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_fp.h  1
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_sp.c  8
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_sp.h  2
-rw-r--r--  drivers/infiniband/hw/cxgb4/mem.c  2
-rw-r--r--  drivers/infiniband/hw/hfi1/init.c  4
-rw-r--r--  drivers/infiniband/hw/hfi1/opfn.c  4
-rw-r--r--  drivers/infiniband/hw/hns/Makefile  4
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_ah.c  1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_bond.c  1012
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_bond.h  95
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_device.h  16
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  141
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  20
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_main.c  185
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_pd.c  1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_qp.c  5
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_srq.c  1
-rw-r--r--  drivers/infiniband/hw/irdma/cm.c  2
-rw-r--r--  drivers/infiniband/hw/irdma/ctrl.c  107
-rw-r--r--  drivers/infiniband/hw/irdma/hw.c  3
-rw-r--r--  drivers/infiniband/hw/irdma/icrdma_if.c  6
-rw-r--r--  drivers/infiniband/hw/irdma/ig3rdma_if.c  4
-rw-r--r--  drivers/infiniband/hw/irdma/main.h  3
-rw-r--r--  drivers/infiniband/hw/irdma/pble.c  6
-rw-r--r--  drivers/infiniband/hw/irdma/puda.c  20
-rw-r--r--  drivers/infiniband/hw/irdma/type.h  5
-rw-r--r--  drivers/infiniband/hw/irdma/uk.c  67
-rw-r--r--  drivers/infiniband/hw/irdma/user.h  6
-rw-r--r--  drivers/infiniband/hw/irdma/utils.c  58
-rw-r--r--  drivers/infiniband/hw/irdma/verbs.c  49
-rw-r--r--  drivers/infiniband/hw/irdma/verbs.h  3
-rw-r--r--  drivers/infiniband/hw/mlx4/cm.c  2
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.c  14
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.c  65
-rw-r--r--  drivers/infiniband/hw/mlx5/ib_rep.c  74
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c  4
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c  93
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c  5
-rw-r--r--  drivers/infiniband/sw/rdmavt/cq.c  3
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_mr.c  1
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_net.c  49
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_odp.c  1
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_qp.c  49
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_srq.c  7
-rw-r--r--  drivers/infiniband/sw/siw/siw_cm.c  51
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.c  2
-rw-r--r--  drivers/infiniband/ulp/isert/ib_isert.c  2
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-srv.c  2
-rw-r--r--  drivers/iommu/Kconfig  15
-rw-r--r--  drivers/iommu/Makefile  2
-rw-r--r--  drivers/iommu/amd/Kconfig  5
-rw-r--r--  drivers/iommu/amd/Makefile  2
-rw-r--r--  drivers/iommu/amd/amd_iommu.h  1
-rw-r--r--  drivers/iommu/amd/amd_iommu_types.h  114
-rw-r--r--  drivers/iommu/amd/debugfs.c  2
-rw-r--r--  drivers/iommu/amd/init.c  15
-rw-r--r--  drivers/iommu/amd/io_pgtable.c  577
-rw-r--r--  drivers/iommu/amd/io_pgtable_v2.c  370
-rw-r--r--  drivers/iommu/amd/iommu.c  572
-rw-r--r--  drivers/iommu/apple-dart.c  11
-rw-r--r--  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c  18
-rw-r--r--  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c  33
-rw-r--r--  drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c  28
-rw-r--r--  drivers/iommu/arm/arm-smmu/arm-smmu.c  9
-rw-r--r--  drivers/iommu/arm/arm-smmu/qcom_iommu.c  21
-rw-r--r--  drivers/iommu/dma-iommu.c  4
-rw-r--r--  drivers/iommu/exynos-iommu.c  20
-rw-r--r--  drivers/iommu/fsl_pamu_domain.c  12
-rw-r--r--  drivers/iommu/generic_pt/.kunitconfig  14
-rw-r--r--  drivers/iommu/generic_pt/Kconfig  79
-rw-r--r--  drivers/iommu/generic_pt/fmt/Makefile  28
-rw-r--r--  drivers/iommu/generic_pt/fmt/amdv1.h  411
-rw-r--r--  drivers/iommu/generic_pt/fmt/defs_amdv1.h  21
-rw-r--r--  drivers/iommu/generic_pt/fmt/defs_vtdss.h  21
-rw-r--r--  drivers/iommu/generic_pt/fmt/defs_x86_64.h  21
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_amdv1.c  15
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_mock.c  10
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_template.h  48
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_vtdss.c  10
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_x86_64.c  11
-rw-r--r--  drivers/iommu/generic_pt/fmt/vtdss.h  285
-rw-r--r--  drivers/iommu/generic_pt/fmt/x86_64.h  279
-rw-r--r--  drivers/iommu/generic_pt/iommu_pt.h  1289
-rw-r--r--  drivers/iommu/generic_pt/kunit_generic_pt.h  823
-rw-r--r--  drivers/iommu/generic_pt/kunit_iommu.h  184
-rw-r--r--  drivers/iommu/generic_pt/kunit_iommu_pt.h  487
-rw-r--r--  drivers/iommu/generic_pt/pt_common.h  389
-rw-r--r--  drivers/iommu/generic_pt/pt_defs.h  332
-rw-r--r--  drivers/iommu/generic_pt/pt_fmt_defaults.h  295
-rw-r--r--  drivers/iommu/generic_pt/pt_iter.h  636
-rw-r--r--  drivers/iommu/generic_pt/pt_log2.h  122
-rw-r--r--  drivers/iommu/intel/Kconfig  6
-rw-r--r--  drivers/iommu/intel/iommu.c  931
-rw-r--r--  drivers/iommu/intel/iommu.h  99
-rw-r--r--  drivers/iommu/intel/nested.c  7
-rw-r--r--  drivers/iommu/intel/pasid.c  44
-rw-r--r--  drivers/iommu/intel/pasid.h  1
-rw-r--r--  drivers/iommu/intel/svm.c  1
-rw-r--r--  drivers/iommu/io-pgtable-arm-selftests.c  214
-rw-r--r--  drivers/iommu/io-pgtable-arm.c  203
-rw-r--r--  drivers/iommu/io-pgtable.c  4
-rw-r--r--  drivers/iommu/iommu-pages.c  136
-rw-r--r--  drivers/iommu/iommu-pages.h  51
-rw-r--r--  drivers/iommu/iommu.c  44
-rw-r--r--  drivers/iommu/iommufd/Kconfig  1
-rw-r--r--  drivers/iommu/iommufd/io_pagetable.c  78
-rw-r--r--  drivers/iommu/iommufd/io_pagetable.h  54
-rw-r--r--  drivers/iommu/iommufd/ioas.c  8
-rw-r--r--  drivers/iommu/iommufd/iommufd_private.h  14
-rw-r--r--  drivers/iommu/iommufd/iommufd_test.h  21
-rw-r--r--  drivers/iommu/iommufd/main.c  10
-rw-r--r--  drivers/iommu/iommufd/pages.c  414
-rw-r--r--  drivers/iommu/iommufd/selftest.c  569
-rw-r--r--  drivers/iommu/ipmmu-vmsa.c  12
-rw-r--r--  drivers/iommu/msm_iommu.c  11
-rw-r--r--  drivers/iommu/mtk_iommu.c  174
-rw-r--r--  drivers/iommu/mtk_iommu_v1.c  35
-rw-r--r--  drivers/iommu/omap-iommu.c  19
-rw-r--r--  drivers/iommu/omap-iommu.h  2
-rw-r--r--  drivers/iommu/riscv/iommu.c  9
-rw-r--r--  drivers/iommu/rockchip-iommu.c  20
-rw-r--r--  drivers/iommu/s390-iommu.c  13
-rw-r--r--  drivers/iommu/sprd-iommu.c  3
-rw-r--r--  drivers/iommu/sun50i-iommu.c  10
-rw-r--r--  drivers/iommu/tegra-smmu.c  15
-rw-r--r--  drivers/iommu/virtio-iommu.c  6
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/Makefile  3
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/bnge.h  10
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/bnge_auxr.c  258
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/bnge_auxr.h  84
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/bnge_core.c  18
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c  40
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h  2
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/bnge_resc.c  12
-rw-r--r--  drivers/net/ethernet/broadcom/bnge/bnge_resc.h  1
-rw-r--r--  drivers/net/ethernet/broadcom/bnx2.c  2
-rw-r--r--  drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c  1
-rw-r--r--  drivers/net/ethernet/broadcom/tg3.c  1
-rw-r--r--  drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c  1
-rw-r--r--  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  2
-rw-r--r--  drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c  1
-rw-r--r--  drivers/net/ethernet/intel/e1000e/netdev.c  1
-rw-r--r--  drivers/net/ethernet/intel/fm10k/fm10k_pci.c  6
-rw-r--r--  drivers/net/ethernet/intel/i40e/i40e_main.c  1
-rw-r--r--  drivers/net/ethernet/intel/ice/ice_main.c  2
-rw-r--r--  drivers/net/ethernet/intel/igb/igb_main.c  2
-rw-r--r--  drivers/net/ethernet/intel/igc/igc_main.c  2
-rw-r--r--  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  1
-rw-r--r--  drivers/net/ethernet/mellanox/mlx4/main.c  1
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/main.c  1
-rw-r--r--  drivers/net/ethernet/meta/fbnic/fbnic_pci.c  1
-rw-r--r--  drivers/net/ethernet/microchip/lan743x_main.c  1
-rw-r--r--  drivers/net/ethernet/myricom/myri10ge/myri10ge.c  4
-rw-r--r--  drivers/net/ethernet/neterion/s2io.c  1
-rw-r--r--  drivers/pci/Makefile  2
-rw-r--r--  drivers/pci/bus.c  3
-rw-r--r--  drivers/pci/controller/Kconfig  18
-rw-r--r--  drivers/pci/controller/Makefile  1
-rw-r--r--  drivers/pci/controller/cadence/Kconfig  21
-rw-r--r--  drivers/pci/controller/cadence/Makefile  11
-rw-r--r--  drivers/pci/controller/cadence/pci-j721e.c  33
-rw-r--r--  drivers/pci/controller/cadence/pci-sky1.c  238
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence-host-common.c  288
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence-host-common.h  46
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence-host-hpa.c  368
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence-host.c  278
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h  193
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence-hpa.c  167
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence-lga-regs.h  230
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence-plat.c  9
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence.c  12
-rw-r--r--  drivers/pci/controller/cadence/pcie-cadence.h  409
-rw-r--r--  drivers/pci/controller/cadence/pcie-sg2042.c  3
-rw-r--r--  drivers/pci/controller/dwc/Kconfig  38
-rw-r--r--  drivers/pci/controller/dwc/Makefile  5
-rw-r--r--  drivers/pci/controller/dwc/pci-keystone.c  80
-rw-r--r--  drivers/pci/controller/dwc/pci-meson.c  18
-rw-r--r--  drivers/pci/controller/dwc/pcie-designware-ep.c  1
-rw-r--r--  drivers/pci/controller/dwc/pcie-designware-host.c  12
-rw-r--r--  drivers/pci/controller/dwc/pcie-designware.c  36
-rw-r--r--  drivers/pci/controller/dwc/pcie-designware.h  21
-rw-r--r--  drivers/pci/controller/dwc/pcie-dw-rockchip.c  63
-rw-r--r--  drivers/pci/controller/dwc/pcie-nxp-s32g.c  406
-rw-r--r--  drivers/pci/controller/dwc/pcie-qcom.c  32
-rw-r--r--  drivers/pci/controller/dwc/pcie-spacemit-k1.c  357
-rw-r--r--  drivers/pci/controller/dwc/pcie-stm32-ep.c  43
-rw-r--r--  drivers/pci/controller/dwc/pcie-stm32.c  14
-rw-r--r--  drivers/pci/controller/dwc/pcie-stm32.h  3
-rw-r--r--  drivers/pci/controller/dwc/pcie-tegra194.c  48
-rw-r--r--  drivers/pci/controller/pci-host-common.c  13
-rw-r--r--  drivers/pci/controller/pci-host-common.h  1
-rw-r--r--  drivers/pci/controller/pci-hyperv.c  62
-rw-r--r--  drivers/pci/controller/pci-ixp4xx.c  6
-rw-r--r--  drivers/pci/controller/pcie-apple.c  43
-rw-r--r--  drivers/pci/controller/pcie-brcmstb.c  209
-rw-r--r--  drivers/pci/controller/pcie-mediatek.c  113
-rw-r--r--  drivers/pci/controller/pcie-rzg3s-host.c  1761
-rw-r--r--  drivers/pci/controller/vmd.c  40
-rw-r--r--  drivers/pci/endpoint/functions/pci-epf-test.c  5
-rw-r--r--  drivers/pci/endpoint/functions/pci-epf-vntb.c  153
-rw-r--r--  drivers/pci/endpoint/pci-epf-core.c  159
-rw-r--r--  drivers/pci/host-bridge.c  1
-rw-r--r--  drivers/pci/iov.c  25
-rw-r--r--  drivers/pci/p2pdma.c  186
-rw-r--r--  drivers/pci/pci-driver.c  6
-rw-r--r--  drivers/pci/pci-sysfs.c  19
-rw-r--r--  drivers/pci/pci.c  172
-rw-r--r--  drivers/pci/pci.h  14
-rw-r--r--  drivers/pci/pcie/portdrv.c  1
-rw-r--r--  drivers/pci/pcie/ptm.c  23
-rw-r--r--  drivers/pci/probe.c  13
-rw-r--r--  drivers/pci/pwrctrl/Kconfig  15
-rw-r--r--  drivers/pci/pwrctrl/Makefile  2
-rw-r--r--  drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c  648
-rw-r--r--  drivers/pci/rebar.c  328
-rw-r--r--  drivers/pci/setup-bus.c  126
-rw-r--r--  drivers/pci/setup-res.c  78
-rw-r--r--  drivers/power/reset/Kconfig  9
-rw-r--r--  drivers/power/reset/Makefile  1
-rw-r--r--  drivers/power/reset/spacemit-p1-reboot.c  88
-rw-r--r--  drivers/power/supply/Kconfig  24
-rw-r--r--  drivers/power/supply/Makefile  2
-rw-r--r--  drivers/power/supply/apm_power.c  3
-rw-r--r--  drivers/power/supply/bd71828-power.c  1049
-rw-r--r--  drivers/power/supply/cw2015_battery.c  8
-rw-r--r--  drivers/power/supply/max17040_battery.c  6
-rw-r--r--  drivers/power/supply/max77705_charger.c  56
-rw-r--r--  drivers/power/supply/qcom_battmgr.c  14
-rw-r--r--  drivers/power/supply/rt5033_charger.c  2
-rw-r--r--  drivers/power/supply/rt9467-charger.c  6
-rw-r--r--  drivers/power/supply/rt9756.c  955
-rw-r--r--  drivers/power/supply/wm831x_power.c  10
-rw-r--r--  drivers/ras/ras.c  40
-rw-r--r--  drivers/s390/cio/vfio_ccw_ops.c  47
-rw-r--r--  drivers/scsi/bfa/bfad.c  1
-rw-r--r--  drivers/scsi/csiostor/csio_init.c  1
-rw-r--r--  drivers/scsi/ipr.c  1
-rw-r--r--  drivers/scsi/lpfc/lpfc_init.c  6
-rw-r--r--  drivers/scsi/qla2xxx/qla_os.c  5
-rw-r--r--  drivers/scsi/qla4xxx/ql4_os.c  5
-rw-r--r--  drivers/tty/serial/8250/8250_pci.c  1
-rw-r--r--  drivers/tty/serial/jsm/jsm_driver.c  1
-rw-r--r--  drivers/vdpa/mlx5/net/mlx5_vnet.c  2
-rw-r--r--  drivers/vdpa/octeon_ep/octep_vdpa_main.c  1
-rw-r--r--  drivers/vdpa/pds/vdpa_dev.c  2
-rw-r--r--  drivers/vdpa/vdpa_user/vduse_dev.c  3
-rw-r--r--  drivers/vfio/cdx/main.c  29
-rw-r--r--  drivers/vfio/device_cdev.c  2
-rw-r--r--  drivers/vfio/fsl-mc/vfio_fsl_mc.c  43
-rw-r--r--  drivers/vfio/pci/Kconfig  3
-rw-r--r--  drivers/vfio/pci/Makefile  1
-rw-r--r--  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c  171
-rw-r--r--  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h  23
-rw-r--r--  drivers/vfio/pci/mlx5/main.c  1
-rw-r--r--  drivers/vfio/pci/nvgrace-gpu/main.c  342
-rw-r--r--  drivers/vfio/pci/pds/vfio_dev.c  1
-rw-r--r--  drivers/vfio/pci/qat/main.c  1
-rw-r--r--  drivers/vfio/pci/vfio_pci.c  6
-rw-r--r--  drivers/vfio/pci/vfio_pci_config.c  23
-rw-r--r--  drivers/vfio/pci/vfio_pci_core.c  300
-rw-r--r--  drivers/vfio/pci/vfio_pci_dmabuf.c  350
-rw-r--r--  drivers/vfio/pci/vfio_pci_intrs.c  52
-rw-r--r--  drivers/vfio/pci/vfio_pci_priv.h  28
-rw-r--r--  drivers/vfio/pci/virtio/common.h  5
-rw-r--r--  drivers/vfio/pci/virtio/legacy_io.c  38
-rw-r--r--  drivers/vfio/pci/virtio/main.c  5
-rw-r--r--  drivers/vfio/platform/vfio_amba.c  1
-rw-r--r--  drivers/vfio/platform/vfio_platform.c  1
-rw-r--r--  drivers/vfio/platform/vfio_platform_common.c  40
-rw-r--r--  drivers/vfio/platform/vfio_platform_private.h  3
-rw-r--r--  drivers/vfio/vfio_main.c  51
-rw-r--r--  drivers/vhost/net.c  29
-rw-r--r--  drivers/vhost/scsi.c  9
-rw-r--r--  drivers/vhost/test.c  10
-rw-r--r--  drivers/vhost/vhost.c  4
-rw-r--r--  drivers/vhost/vhost.h  42
-rw-r--r--  drivers/vhost/vsock.c  10
-rw-r--r--  drivers/virtio/virtio.c  12
-rw-r--r--  drivers/virtio/virtio_balloon.c  3
-rw-r--r--  drivers/virtio/virtio_debug.c  10
-rw-r--r--  drivers/virtio/virtio_pci_modern_dev.c  6
-rw-r--r--  drivers/virtio/virtio_ring.c  7
-rw-r--r--  drivers/virtio/virtio_vdpa.c  2
-rw-r--r--  include/dt-bindings/memory/mediatek,mt8189-memory-port.h  283
-rw-r--r--  include/linux/cper.h  12
-rw-r--r--  include/linux/dma-buf-mapping.h  17
-rw-r--r--  include/linux/dma-buf.h  11
-rw-r--r--  include/linux/efi.h  6
-rw-r--r--  include/linux/generic_pt/common.h  191
-rw-r--r--  include/linux/generic_pt/iommu.h  293
-rw-r--r--  include/linux/hisi_acc_qm.h  3
-rw-r--r--  include/linux/io-pgtable.h  2
-rw-r--r--  include/linux/iommu.h  3
-rw-r--r--  include/linux/irqchip/riscv-imsic.h  3
-rw-r--r--  include/linux/pci-epf.h  12
-rw-r--r--  include/linux/pci-p2pdma.h  120
-rw-r--r--  include/linux/pci.h  27
-rw-r--r--  include/linux/power/max77705_charger.h  2
-rw-r--r--  include/linux/ras.h  16
-rw-r--r--  include/linux/sizes.h  1
-rw-r--r--  include/linux/vfio.h  6
-rw-r--r--  include/linux/vfio_pci_core.h  73
-rw-r--r--  include/linux/virtio.h  2
-rw-r--r--  include/linux/virtio_config.h  24
-rw-r--r--  include/linux/virtio_features.h  29
-rw-r--r--  include/linux/virtio_pci_modern.h  8
-rw-r--r--  include/ras/ras_event.h  49
-rw-r--r--  include/rdma/ib_cm.h  4
-rw-r--r--  include/rdma/ib_verbs.h  100
-rw-r--r--  include/rdma/rdmavt_qp.h  70
-rw-r--r--  include/uapi/linux/iommufd.h  10
-rw-r--r--  include/uapi/linux/vfio.h  28
-rw-r--r--  include/uapi/linux/virtio_pci.h  2
-rw-r--r--  kernel/dma/direct.c  4
-rw-r--r--  mm/hmm.c  2
-rw-r--r--  samples/vfio-mdev/mbochs.c  71
-rw-r--r--  samples/vfio-mdev/mdpy.c  34
-rw-r--r--  samples/vfio-mdev/mtty.c  35
-rw-r--r--  tools/testing/cxl/Kbuild  3
-rw-r--r--  tools/testing/cxl/test/Kbuild  1
-rw-r--r--  tools/testing/cxl/test/cxl.c  86
-rw-r--r--  tools/testing/cxl/test/cxl_translate.c  445
-rw-r--r--  tools/testing/cxl/test/mem.c  11
-rw-r--r--  tools/testing/cxl/test/mock.c  52
-rw-r--r--  tools/testing/cxl/test/mock.h  4
-rw-r--r--  tools/testing/selftests/iommu/iommufd.c  103
-rw-r--r--  tools/testing/selftests/iommu/iommufd_utils.h  56
-rw-r--r--  tools/testing/selftests/vfio/Makefile  10
-rw-r--r--  tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c  36
-rw-r--r--  tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c  18
-rw-r--r--  tools/testing/selftests/vfio/lib/include/libvfio.h  26
-rw-r--r--  tools/testing/selftests/vfio/lib/include/libvfio/assert.h  54
-rw-r--r--  tools/testing/selftests/vfio/lib/include/libvfio/iommu.h  76
-rw-r--r--  tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h  23
-rw-r--r--  tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h  125
-rw-r--r--  tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h  97
-rw-r--r--  tools/testing/selftests/vfio/lib/include/vfio_util.h  331
-rw-r--r--  tools/testing/selftests/vfio/lib/iommu.c  465
-rw-r--r--  tools/testing/selftests/vfio/lib/iova_allocator.c  94
-rw-r--r--  tools/testing/selftests/vfio/lib/libvfio.c  78
-rw-r--r--  tools/testing/selftests/vfio/lib/libvfio.mk  23
-rw-r--r--  tools/testing/selftests/vfio/lib/vfio_pci_device.c  556
-rw-r--r--  tools/testing/selftests/vfio/lib/vfio_pci_driver.c  16
-rwxr-xr-x  tools/testing/selftests/vfio/run.sh  109
-rwxr-xr-x  tools/testing/selftests/vfio/scripts/cleanup.sh  41
-rwxr-xr-x  tools/testing/selftests/vfio/scripts/lib.sh  42
-rwxr-xr-x  tools/testing/selftests/vfio/scripts/run.sh  16
-rwxr-xr-x  tools/testing/selftests/vfio/scripts/setup.sh  48
-rw-r--r--  tools/testing/selftests/vfio/vfio_dma_mapping_test.c  46
-rw-r--r--  tools/testing/selftests/vfio/vfio_iommufd_setup_test.c  2
-rw-r--r--  tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c  168
-rw-r--r--  tools/testing/selftests/vfio/vfio_pci_device_test.c  12
-rw-r--r--  tools/testing/selftests/vfio/vfio_pci_driver_test.c  51
457 files changed, 29258 insertions(+), 7591 deletions(-)
diff --git a/.clang-format b/.clang-format
index ecb5035a3d9d..2fadacbd3b8e 100644
--- a/.clang-format
+++ b/.clang-format
@@ -415,6 +415,7 @@ ForEachMacros:
- 'for_each_prop_dlc_cpus'
- 'for_each_prop_dlc_platforms'
- 'for_each_property_of_node'
+ - 'for_each_pt_level_entry'
- 'for_each_rdt_resource'
- 'for_each_reg'
- 'for_each_reg_filtered'
diff --git a/.mailmap b/.mailmap
index eb7515272ce6..6b22c085b39d 100644
--- a/.mailmap
+++ b/.mailmap
@@ -345,7 +345,8 @@ Jayachandran C <c.jayachandran@gmail.com> <jayachandranc@netlogicmicro.com>
Jayachandran C <c.jayachandran@gmail.com> <jchandra@broadcom.com>
Jayachandran C <c.jayachandran@gmail.com> <jchandra@digeo.com>
Jayachandran C <c.jayachandran@gmail.com> <jnair@caviumnetworks.com>
-<jean-philippe@linaro.org> <jean-philippe.brucker@arm.com>
+Jean-Philippe Brucker <jpb@kernel.org> <jean-philippe.brucker@arm.com>
+Jean-Philippe Brucker <jpb@kernel.org> <jean-philippe@linaro.org>
Jean-Michel Hautbois <jeanmichel.hautbois@yoseli.org> <jeanmichel.hautbois@ideasonboard.com>
Jean Tourrilhes <jt@hpl.hp.com>
Jeevan Shriram <quic_jshriram@quicinc.com> <jshriram@codeaurora.org>
diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index 6b4e8c7a963d..c80a1b5a03db 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -496,8 +496,17 @@ Description:
changed, only freed by writing 0. The kernel makes no guarantees
that data is maintained over an address space freeing event, and
there is no guarantee that a free followed by an allocate
- results in the same address being allocated.
+ results in the same address being allocated. If an extended
+ linear cache is present, the size indicates the extended linear
+ cache size plus the CXL region size.
+What: /sys/bus/cxl/devices/regionZ/extended_linear_cache_size
+Date: October, 2025
+KernelVersion: v6.19
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) The size of the extended linear cache, if one is present.
+ Otherwise the attribute is not visible.
What: /sys/bus/cxl/devices/regionZ/mode
Date: January, 2023
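
For illustration, a minimal user-space sketch that consumes the new attribute; the "region0" name and the exact value formats are assumptions, not taken from the patch:

/* Hypothetical sketch: read a region's extended linear cache size and
 * subtract it from the reported size to get the CXL-backed capacity.
 * "region0" and the value formats are illustrative assumptions.
 */
#include <stdio.h>

int main(void)
{
	long long cache = 0, size = 0;
	FILE *f;

	/* The attribute is only visible when an extended linear cache exists. */
	f = fopen("/sys/bus/cxl/devices/region0/extended_linear_cache_size", "r");
	if (f) {
		fscanf(f, "%lli", &cache);	/* accepts decimal or 0x-prefixed */
		fclose(f);
	}

	f = fopen("/sys/bus/cxl/devices/region0/size", "r");
	if (!f)
		return 1;
	fscanf(f, "%lli", &size);
	fclose(f);

	/* Per the ABI text above, size includes the cache when present. */
	printf("CXL-backed capacity: %lld bytes\n", size - cache);
	return 0;
}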
diff --git a/Documentation/ABI/testing/sysfs-class-power-rt9756 b/Documentation/ABI/testing/sysfs-class-power-rt9756
new file mode 100644
index 000000000000..c4d6c2b4715d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-power-rt9756
@@ -0,0 +1,30 @@
+What: /sys/class/power_supply/rt9756-*/watchdog_timer
+Date: Dec 2025
+KernelVersion: 6.19
+Contact: ChiYuan Huang <cy_huang@richtek.com>
+Description:
+ This entry shows and sets the watchdog timer used while the
+ rt9756 charger operates in charging mode. When the timer
+ expires, the device disables charging. Any host communication
+ restarts the timer and so prevents it from expiring.
+
+ Access: Read, Write
+
+ Valid values:
+ - 500, 1000, 5000, 30000, 40000, 80000, 128000 or 255000 (milliseconds),
+ - 0: disabled
+
+What: /sys/class/power_supply/rt9756-*/operation_mode
+Date: Dec 2025
+KernelVersion: 6.19
+Contact: ChiYuan Huang <cy_huang@richtek.com>
+ This entry shows and sets the operation mode used while the
+ rt9756 charger is in the charging phase. In 'bypass' mode the
+ internal path connects vbus directly to vbat; otherwise the
+ default 'div2' mode is used for switched-capacitor charging.
+ mode for the switch-cap charging.
+
+ Access: Read, Write
+
+ Valid values:
+ - 'bypass' or 'div2'
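
For illustration, a minimal user-space sketch that drives the two attributes above; the "rt9756-0" instance name is an assumption, since the actual suffix depends on probe order:

/* Hypothetical sketch: switch the charger to bypass mode and disable
 * the charging watchdog through the sysfs attributes described above.
 * The "rt9756-0" device name is an illustrative assumption.
 */
#include <stdio.h>

static int write_attr(const char *attr, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/class/power_supply/rt9756-0/%s", attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	if (write_attr("operation_mode", "bypass\n"))	/* or "div2" */
		return 1;
	if (write_attr("watchdog_timer", "0\n"))	/* 0 disables the watchdog */
		return 1;
	return 0;
}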
diff --git a/Documentation/PCI/pci-error-recovery.rst b/Documentation/PCI/pci-error-recovery.rst
index 5df481ac6193..43bc4e3665b4 100644
--- a/Documentation/PCI/pci-error-recovery.rst
+++ b/Documentation/PCI/pci-error-recovery.rst
@@ -326,6 +326,21 @@ be recovered, there is nothing more that can be done; the platform
will typically report a "permanent failure" in such a case. The
device will be considered "dead" in this case.
+Drivers typically need to call pci_restore_state() after reset to
+re-initialize the device's config space registers and thereby
+bring it from D0\ :sub:`uninitialized` into D0\ :sub:`active` state
+(PCIe r7.0 sec 5.3.1.1). The PCI core invokes pci_save_state()
+on enumeration after initializing config space to ensure that a
+saved state is available for subsequent error recovery.
+Drivers which modify config space on probe may need to invoke
+pci_save_state() afterwards to record those changes for later
+error recovery. When going into system suspend, pci_save_state()
+is called for every PCI device and that state will be restored
+not only on resume, but also on any subsequent error recovery.
+In the unlikely event that the saved state recorded on suspend
+is unsuitable for error recovery, drivers should call
+pci_save_state() on resume.
+
Drivers for multi-function cards will need to coordinate among
themselves as to which driver instance will perform any "one-shot"
or global device initialization. For example, the Symbios sym53cxx2
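
For illustration, a minimal driver-side sketch of the save/restore discipline described above; the foo_* names are placeholders, and device-specific re-initialization is elided:

/* Hypothetical sketch, not an in-tree driver. */
#include <linux/pci.h>

static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int ret;

	ret = pcim_enable_device(pdev);
	if (ret)
		return ret;

	/* ... driver-specific config space writes ... */

	/* Record the modified config space for later error recovery. */
	pci_save_state(pdev);
	return 0;
}

static pci_ers_result_t foo_slot_reset(struct pci_dev *pdev)
{
	/*
	 * After reset the device is in D0uninitialized; rewriting the
	 * saved config space brings it back to D0active.
	 */
	pci_restore_state(pdev);
	return PCI_ERS_RESULT_RECOVERED;
}

static const struct pci_error_handlers foo_err_handlers = {
	.slot_reset	= foo_slot_reset,
};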
diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
index 5003403cda18..cdbd23b5c08c 100644
--- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
+++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
@@ -35,6 +35,8 @@ properties:
- description: Qcom SoCs implementing "qcom,smmu-500" and "arm,mmu-500"
items:
- enum:
+ - qcom,glymur-smmu-500
+ - qcom,kaanapali-smmu-500
- qcom,milos-smmu-500
- qcom,qcm2290-smmu-500
- qcom,qcs615-smmu-500
diff --git a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
index f49ed8ac4776..79c573c47b08 100644
--- a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
+++ b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
@@ -82,6 +82,9 @@ properties:
- mediatek,mt8188-iommu-vdo # generation two
- mediatek,mt8188-iommu-vpp # generation two
- mediatek,mt8188-iommu-infra # generation two
+ - mediatek,mt8189-iommu-apu # generation two
+ - mediatek,mt8189-iommu-infra # generation two
+ - mediatek,mt8189-iommu-mm # generation two
- mediatek,mt8192-m4u # generation two
- mediatek,mt8195-iommu-vdo # generation two
- mediatek,mt8195-iommu-vpp # generation two
@@ -128,6 +131,7 @@ properties:
This is the mtk_m4u_id according to the HW. Specifies the mtk_m4u_id as
defined in
dt-binding/memory/mediatek,mt8188-memory-port.h for mt8188,
+ dt-binding/memory/mediatek,mt8189-memory-port.h for mt8189,
dt-binding/memory/mt2701-larb-port.h for mt2701 and mt7623,
dt-binding/memory/mt2712-larb-port.h for mt2712,
dt-binding/memory/mt6779-larb-port.h for mt6779,
@@ -164,6 +168,7 @@ allOf:
- mediatek,mt8186-iommu-mm
- mediatek,mt8188-iommu-vdo
- mediatek,mt8188-iommu-vpp
+ - mediatek,mt8189-iommu-mm
- mediatek,mt8192-m4u
- mediatek,mt8195-iommu-vdo
- mediatek,mt8195-iommu-vpp
@@ -180,6 +185,7 @@ allOf:
- mediatek,mt8186-iommu-mm
- mediatek,mt8188-iommu-vdo
- mediatek,mt8188-iommu-vpp
+ - mediatek,mt8189-iommu-mm
- mediatek,mt8192-m4u
- mediatek,mt8195-iommu-vdo
- mediatek,mt8195-iommu-vpp
@@ -208,6 +214,8 @@ allOf:
contains:
enum:
- mediatek,mt8188-iommu-infra
+ - mediatek,mt8189-iommu-apu
+ - mediatek,mt8189-iommu-infra
- mediatek,mt8195-iommu-infra
then:
diff --git a/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml b/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml
index 3e5623edd207..93a489025317 100644
--- a/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml
+++ b/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml
@@ -32,14 +32,18 @@ properties:
- const: qcom,msm-iommu-v2
clocks:
+ minItems: 2
items:
- description: Clock required for IOMMU register group access
- description: Clock required for underlying bus access
+ - description: Clock required for Translation Buffer Unit access
clock-names:
+ minItems: 2
items:
- const: iface
- const: bus
+ - const: tbu
power-domains:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/pci/amlogic,axg-pcie.yaml b/Documentation/devicetree/bindings/pci/amlogic,axg-pcie.yaml
index 79a21ba0f9fd..d67cb7a850a3 100644
--- a/Documentation/devicetree/bindings/pci/amlogic,axg-pcie.yaml
+++ b/Documentation/devicetree/bindings/pci/amlogic,axg-pcie.yaml
@@ -20,9 +20,10 @@ allOf:
select:
properties:
compatible:
- enum:
- - amlogic,axg-pcie
- - amlogic,g12a-pcie
+ contains:
+ enum:
+ - amlogic,axg-pcie
+ - amlogic,g12a-pcie
required:
- compatible
@@ -36,13 +37,13 @@ properties:
reg:
items:
- - description: External local bus interface registers
+ - description: Data Bus Interface registers
- description: Meson designed configuration registers
- description: PCIe configuration space
reg-names:
items:
- - const: elbi
+ - const: dbi
- const: cfg
- const: config
@@ -51,15 +52,15 @@ properties:
clocks:
items:
+ - description: PCIe PHY clock
- description: PCIe GEN 100M PLL clock
- description: PCIe RC clock gate
- - description: PCIe PHY clock
clock-names:
items:
+ - const: general
- const: pclk
- const: port
- - const: general
phys:
maxItems: 1
@@ -88,7 +89,7 @@ required:
- reg
- reg-names
- interrupts
- - clock
+ - clocks
- clock-names
- "#address-cells"
- "#size-cells"
@@ -113,10 +114,10 @@ examples:
pcie: pcie@f9800000 {
compatible = "amlogic,axg-pcie", "snps,dw-pcie";
reg = <0xf9800000 0x400000>, <0xff646000 0x2000>, <0xf9f00000 0x100000>;
- reg-names = "elbi", "cfg", "config";
+ reg-names = "dbi", "cfg", "config";
interrupts = <GIC_SPI 177 IRQ_TYPE_EDGE_RISING>;
- clocks = <&pclk>, <&clk_port>, <&clk_phy>;
- clock-names = "pclk", "port", "general";
+ clocks = <&clk_phy>, <&pclk>, <&clk_port>;
+ clock-names = "general", "pclk", "port";
resets = <&reset_pcie_port>, <&reset_pcie_apb>;
reset-names = "port", "apb";
phys = <&pcie_phy>;
diff --git a/Documentation/devicetree/bindings/pci/cix,sky1-pcie-host.yaml b/Documentation/devicetree/bindings/pci/cix,sky1-pcie-host.yaml
new file mode 100644
index 000000000000..b910a42e0843
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/cix,sky1-pcie-host.yaml
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/cix,sky1-pcie-host.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: CIX Sky1 PCIe Root Complex
+
+maintainers:
+ - Hans Zhang <hans.zhang@cixtech.com>
+
+description:
+ PCIe root complex controller based on the Cadence PCIe core.
+
+allOf:
+ - $ref: /schemas/pci/pci-host-bridge.yaml#
+
+properties:
+ compatible:
+ const: cix,sky1-pcie-host
+
+ reg:
+ items:
+ - description: PCIe controller registers.
+ - description: ECAM registers.
+ - description: Remote CIX System Unit strap registers.
+ - description: Remote CIX System Unit status registers.
+ - description: Region for sending messages registers.
+
+ reg-names:
+ items:
+ - const: reg
+ - const: cfg
+ - const: rcsu_strap
+ - const: rcsu_status
+ - const: msg
+
+ ranges:
+ maxItems: 3
+
+required:
+ - compatible
+ - ranges
+ - bus-range
+ - device_type
+ - interrupt-map
+ - interrupt-map-mask
+ - msi-map
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pcie@a010000 {
+ compatible = "cix,sky1-pcie-host";
+ reg = <0x00 0x0a010000 0x00 0x10000>,
+ <0x00 0x2c000000 0x00 0x4000000>,
+ <0x00 0x0a000300 0x00 0x100>,
+ <0x00 0x0a000400 0x00 0x100>,
+ <0x00 0x60000000 0x00 0x00100000>;
+ reg-names = "reg", "cfg", "rcsu_strap", "rcsu_status", "msg";
+ ranges = <0x01000000 0x00 0x60100000 0x00 0x60100000 0x00 0x00100000>,
+ <0x02000000 0x00 0x60200000 0x00 0x60200000 0x00 0x1fe00000>,
+ <0x43000000 0x18 0x00000000 0x18 0x00000000 0x04 0x00000000>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ bus-range = <0xc0 0xff>;
+ device_type = "pci";
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 0x7>;
+ interrupt-map = <0 0 0 1 &gic 0 0 GIC_SPI 407 IRQ_TYPE_LEVEL_HIGH 0>,
+ <0 0 0 2 &gic 0 0 GIC_SPI 408 IRQ_TYPE_LEVEL_HIGH 0>,
+ <0 0 0 3 &gic 0 0 GIC_SPI 409 IRQ_TYPE_LEVEL_HIGH 0>,
+ <0 0 0 4 &gic 0 0 GIC_SPI 410 IRQ_TYPE_LEVEL_HIGH 0>;
+ msi-map = <0xc000 &gic_its 0xc000 0x4000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie-mt7623.yaml b/Documentation/devicetree/bindings/pci/mediatek-pcie-mt7623.yaml
new file mode 100644
index 000000000000..e33bcc216e30
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/mediatek-pcie-mt7623.yaml
@@ -0,0 +1,164 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/mediatek-pcie-mt7623.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: PCIe controller on MediaTek MT2701/MT7623 SoCs
+
+maintainers:
+ - Christian Marangi <ansuelsmth@gmail.com>
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt2701-pcie
+ - mediatek,mt7623-pcie
+
+ reg:
+ minItems: 4
+ maxItems: 4
+
+ reg-names:
+ items:
+ - const: subsys
+ - const: port0
+ - const: port1
+ - const: port2
+
+ clocks:
+ minItems: 4
+ maxItems: 4
+
+ clock-names:
+ items:
+ - const: free_ck
+ - const: sys_ck0
+ - const: sys_ck1
+ - const: sys_ck2
+
+ resets:
+ minItems: 3
+ maxItems: 3
+
+ reset-names:
+ items:
+ - const: pcie-rst0
+ - const: pcie-rst1
+ - const: pcie-rst2
+
+ phys:
+ minItems: 3
+ maxItems: 3
+
+ phy-names:
+ items:
+ - const: pcie-phy0
+ - const: pcie-phy1
+ - const: pcie-phy2
+
+ power-domains:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - ranges
+ - clocks
+ - clock-names
+ - '#interrupt-cells'
+ - resets
+ - reset-names
+ - phys
+ - phy-names
+ - power-domains
+ - pcie@0,0
+ - pcie@1,0
+ - pcie@2,0
+
+allOf:
+ - $ref: /schemas/pci/pci-host-bridge.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ # MT7623
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/clock/mt2701-clk.h>
+ #include <dt-bindings/reset/mt2701-resets.h>
+ #include <dt-bindings/phy/phy.h>
+ #include <dt-bindings/power/mt2701-power.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pcie@1a140000 {
+ compatible = "mediatek,mt7623-pcie";
+ device_type = "pci";
+ reg = <0 0x1a140000 0 0x1000>, /* PCIe shared registers */
+ <0 0x1a142000 0 0x1000>, /* Port0 registers */
+ <0 0x1a143000 0 0x1000>, /* Port1 registers */
+ <0 0x1a144000 0 0x1000>; /* Port2 registers */
+ reg-names = "subsys", "port0", "port1", "port2";
+ #address-cells = <3>;
+ #size-cells = <2>;
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0xf800 0 0 0>;
+ interrupt-map = <0x0000 0 0 0 &sysirq GIC_SPI 193 IRQ_TYPE_LEVEL_LOW>,
+ <0x0800 0 0 0 &sysirq GIC_SPI 194 IRQ_TYPE_LEVEL_LOW>,
+ <0x1000 0 0 0 &sysirq GIC_SPI 195 IRQ_TYPE_LEVEL_LOW>;
+ clocks = <&topckgen CLK_TOP_ETHIF_SEL>,
+ <&hifsys CLK_HIFSYS_PCIE0>,
+ <&hifsys CLK_HIFSYS_PCIE1>,
+ <&hifsys CLK_HIFSYS_PCIE2>;
+ clock-names = "free_ck", "sys_ck0", "sys_ck1", "sys_ck2";
+ resets = <&hifsys MT2701_HIFSYS_PCIE0_RST>,
+ <&hifsys MT2701_HIFSYS_PCIE1_RST>,
+ <&hifsys MT2701_HIFSYS_PCIE2_RST>;
+ reset-names = "pcie-rst0", "pcie-rst1", "pcie-rst2";
+ phys = <&pcie0_phy PHY_TYPE_PCIE>, <&pcie1_phy PHY_TYPE_PCIE>,
+ <&pcie2_phy PHY_TYPE_PCIE>;
+ phy-names = "pcie-phy0", "pcie-phy1", "pcie-phy2";
+ power-domains = <&scpsys MT2701_POWER_DOMAIN_HIF>;
+ bus-range = <0x00 0xff>;
+ ranges = <0x81000000 0 0x1a160000 0 0x1a160000 0 0x00010000>, /* I/O space */
+ <0x83000000 0 0x60000000 0 0x60000000 0 0x10000000>; /* memory space */
+
+ pcie@0,0 {
+ device_type = "pci";
+ reg = <0x0000 0 0 0 0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 0>;
+ interrupt-map = <0 0 0 0 &sysirq GIC_SPI 193 IRQ_TYPE_LEVEL_LOW>;
+ ranges;
+ };
+
+ pcie@1,0 {
+ device_type = "pci";
+ reg = <0x0800 0 0 0 0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 0>;
+ interrupt-map = <0 0 0 0 &sysirq GIC_SPI 194 IRQ_TYPE_LEVEL_LOW>;
+ ranges;
+ };
+
+ pcie@2,0 {
+ device_type = "pci";
+ reg = <0x1000 0 0 0 0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 0>;
+ interrupt-map = <0 0 0 0 &sysirq GIC_SPI 195 IRQ_TYPE_LEVEL_LOW>;
+ ranges;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt
deleted file mode 100644
index 684227522267..000000000000
--- a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt
+++ /dev/null
@@ -1,289 +0,0 @@
-MediaTek Gen2 PCIe controller
-
-Required properties:
-- compatible: Should contain one of the following strings:
- "mediatek,mt2701-pcie"
- "mediatek,mt2712-pcie"
- "mediatek,mt7622-pcie"
- "mediatek,mt7623-pcie"
- "mediatek,mt7629-pcie"
- "airoha,en7523-pcie"
-- device_type: Must be "pci"
-- reg: Base addresses and lengths of the root ports.
-- reg-names: Names of the above areas to use during resource lookup.
-- #address-cells: Address representation for root ports (must be 3)
-- #size-cells: Size representation for root ports (must be 2)
-- clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
-- clock-names:
- Mandatory entries:
- - sys_ckN :transaction layer and data link layer clock
- Required entries for MT2701/MT7623:
- - free_ck :for reference clock of PCIe subsys
- Required entries for MT2712/MT7622:
- - ahb_ckN :AHB slave interface operating clock for CSR access and RC
- initiated MMIO access
- Required entries for MT7622:
- - axi_ckN :application layer MMIO channel operating clock
- - aux_ckN :pe2_mac_bridge and pe2_mac_core operating clock when
- pcie_mac_ck/pcie_pipe_ck is turned off
- - obff_ckN :OBFF functional block operating clock
- - pipe_ckN :LTSSM and PHY/MAC layer operating clock
- where N starting from 0 to one less than the number of root ports.
-- phys: List of PHY specifiers (used by generic PHY framework).
-- phy-names : Must be "pcie-phy0", "pcie-phy1", "pcie-phyN".. based on the
- number of PHYs as specified in *phys* property.
-- power-domains: A phandle and power domain specifier pair to the power domain
- which is responsible for collapsing and restoring power to the peripheral.
-- bus-range: Range of bus numbers associated with this controller.
-- ranges: Ranges for the PCI memory and I/O regions.
-
-Required properties for MT7623/MT2701:
-- #interrupt-cells: Size representation for interrupts (must be 1)
-- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties
- Please refer to the standard PCI bus binding document for a more detailed
- explanation.
-- resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
-- reset-names: Must be "pcie-rst0", "pcie-rst1", "pcie-rstN".. based on the
- number of root ports.
-
-Required properties for MT2712/MT7622/MT7629:
--interrupts: A list of interrupt outputs of the controller, must have one
- entry for each PCIe port
-- interrupt-names: Must include the following entries:
- - "pcie_irq": The interrupt that is asserted when an MSI/INTX is received
-- linux,pci-domain: PCI domain ID. Should be unique for each host controller
-
-In addition, the device tree node must have sub-nodes describing each
-PCIe port interface, having the following mandatory properties:
-
-Required properties:
-- device_type: Must be "pci"
-- reg: Only the first four bytes are used to refer to the correct bus number
- and device number.
-- #address-cells: Must be 3
-- #size-cells: Must be 2
-- #interrupt-cells: Must be 1
-- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties
- Please refer to the standard PCI bus binding document for a more detailed
- explanation.
-- ranges: Sub-ranges distributed from the PCIe controller node. An empty
- property is sufficient.
-
-Examples for MT7623:
-
- hifsys: syscon@1a000000 {
- compatible = "mediatek,mt7623-hifsys",
- "mediatek,mt2701-hifsys",
- "syscon";
- reg = <0 0x1a000000 0 0x1000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
- pcie: pcie@1a140000 {
- compatible = "mediatek,mt7623-pcie";
- device_type = "pci";
- reg = <0 0x1a140000 0 0x1000>, /* PCIe shared registers */
- <0 0x1a142000 0 0x1000>, /* Port0 registers */
- <0 0x1a143000 0 0x1000>, /* Port1 registers */
- <0 0x1a144000 0 0x1000>; /* Port2 registers */
- reg-names = "subsys", "port0", "port1", "port2";
- #address-cells = <3>;
- #size-cells = <2>;
- #interrupt-cells = <1>;
- interrupt-map-mask = <0xf800 0 0 0>;
- interrupt-map = <0x0000 0 0 0 &sysirq GIC_SPI 193 IRQ_TYPE_LEVEL_LOW>,
- <0x0800 0 0 0 &sysirq GIC_SPI 194 IRQ_TYPE_LEVEL_LOW>,
- <0x1000 0 0 0 &sysirq GIC_SPI 195 IRQ_TYPE_LEVEL_LOW>;
- clocks = <&topckgen CLK_TOP_ETHIF_SEL>,
- <&hifsys CLK_HIFSYS_PCIE0>,
- <&hifsys CLK_HIFSYS_PCIE1>,
- <&hifsys CLK_HIFSYS_PCIE2>;
- clock-names = "free_ck", "sys_ck0", "sys_ck1", "sys_ck2";
- resets = <&hifsys MT2701_HIFSYS_PCIE0_RST>,
- <&hifsys MT2701_HIFSYS_PCIE1_RST>,
- <&hifsys MT2701_HIFSYS_PCIE2_RST>;
- reset-names = "pcie-rst0", "pcie-rst1", "pcie-rst2";
- phys = <&pcie0_phy PHY_TYPE_PCIE>, <&pcie1_phy PHY_TYPE_PCIE>,
- <&pcie2_phy PHY_TYPE_PCIE>;
- phy-names = "pcie-phy0", "pcie-phy1", "pcie-phy2";
- power-domains = <&scpsys MT2701_POWER_DOMAIN_HIF>;
- bus-range = <0x00 0xff>;
- ranges = <0x81000000 0 0x1a160000 0 0x1a160000 0 0x00010000 /* I/O space */
- 0x83000000 0 0x60000000 0 0x60000000 0 0x10000000>; /* memory space */
-
- pcie@0,0 {
- reg = <0x0000 0 0 0 0>;
- #address-cells = <3>;
- #size-cells = <2>;
- #interrupt-cells = <1>;
- interrupt-map-mask = <0 0 0 0>;
- interrupt-map = <0 0 0 0 &sysirq GIC_SPI 193 IRQ_TYPE_LEVEL_LOW>;
- ranges;
- };
-
- pcie@1,0 {
- reg = <0x0800 0 0 0 0>;
- #address-cells = <3>;
- #size-cells = <2>;
- #interrupt-cells = <1>;
- interrupt-map-mask = <0 0 0 0>;
- interrupt-map = <0 0 0 0 &sysirq GIC_SPI 194 IRQ_TYPE_LEVEL_LOW>;
- ranges;
- };
-
- pcie@2,0 {
- reg = <0x1000 0 0 0 0>;
- #address-cells = <3>;
- #size-cells = <2>;
- #interrupt-cells = <1>;
- interrupt-map-mask = <0 0 0 0>;
- interrupt-map = <0 0 0 0 &sysirq GIC_SPI 195 IRQ_TYPE_LEVEL_LOW>;
- ranges;
- };
- };
-
-Examples for MT2712:
-
- pcie1: pcie@112ff000 {
- compatible = "mediatek,mt2712-pcie";
- device_type = "pci";
- reg = <0 0x112ff000 0 0x1000>;
- reg-names = "port1";
- linux,pci-domain = <1>;
- #address-cells = <3>;
- #size-cells = <2>;
- interrupts = <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "pcie_irq";
- clocks = <&topckgen CLK_TOP_PE2_MAC_P1_SEL>,
- <&pericfg CLK_PERI_PCIE1>;
- clock-names = "sys_ck1", "ahb_ck1";
- phys = <&u3port1 PHY_TYPE_PCIE>;
- phy-names = "pcie-phy1";
- bus-range = <0x00 0xff>;
- ranges = <0x82000000 0 0x11400000 0x0 0x11400000 0 0x300000>;
- status = "disabled";
-
- #interrupt-cells = <1>;
- interrupt-map-mask = <0 0 0 7>;
- interrupt-map = <0 0 0 1 &pcie_intc1 0>,
- <0 0 0 2 &pcie_intc1 1>,
- <0 0 0 3 &pcie_intc1 2>,
- <0 0 0 4 &pcie_intc1 3>;
- pcie_intc1: interrupt-controller {
- interrupt-controller;
- #address-cells = <0>;
- #interrupt-cells = <1>;
- };
- };
-
- pcie0: pcie@11700000 {
- compatible = "mediatek,mt2712-pcie";
- device_type = "pci";
- reg = <0 0x11700000 0 0x1000>;
- reg-names = "port0";
- linux,pci-domain = <0>;
- #address-cells = <3>;
- #size-cells = <2>;
- interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "pcie_irq";
- clocks = <&topckgen CLK_TOP_PE2_MAC_P0_SEL>,
- <&pericfg CLK_PERI_PCIE0>;
- clock-names = "sys_ck0", "ahb_ck0";
- phys = <&u3port0 PHY_TYPE_PCIE>;
- phy-names = "pcie-phy0";
- bus-range = <0x00 0xff>;
- ranges = <0x82000000 0 0x20000000 0x0 0x20000000 0 0x10000000>;
- status = "disabled";
-
- #interrupt-cells = <1>;
- interrupt-map-mask = <0 0 0 7>;
- interrupt-map = <0 0 0 1 &pcie_intc0 0>,
- <0 0 0 2 &pcie_intc0 1>,
- <0 0 0 3 &pcie_intc0 2>,
- <0 0 0 4 &pcie_intc0 3>;
- pcie_intc0: interrupt-controller {
- interrupt-controller;
- #address-cells = <0>;
- #interrupt-cells = <1>;
- };
- };
-
-Examples for MT7622:
-
- pcie0: pcie@1a143000 {
- compatible = "mediatek,mt7622-pcie";
- device_type = "pci";
- reg = <0 0x1a143000 0 0x1000>;
- reg-names = "port0";
- linux,pci-domain = <0>;
- #address-cells = <3>;
- #size-cells = <2>;
- interrupts = <GIC_SPI 228 IRQ_TYPE_LEVEL_LOW>;
- interrupt-names = "pcie_irq";
- clocks = <&pciesys CLK_PCIE_P0_MAC_EN>,
- <&pciesys CLK_PCIE_P0_AHB_EN>,
- <&pciesys CLK_PCIE_P0_AUX_EN>,
- <&pciesys CLK_PCIE_P0_AXI_EN>,
- <&pciesys CLK_PCIE_P0_OBFF_EN>,
- <&pciesys CLK_PCIE_P0_PIPE_EN>;
- clock-names = "sys_ck0", "ahb_ck0", "aux_ck0",
- "axi_ck0", "obff_ck0", "pipe_ck0";
-
- power-domains = <&scpsys MT7622_POWER_DOMAIN_HIF0>;
- bus-range = <0x00 0xff>;
- ranges = <0x82000000 0 0x20000000 0x0 0x20000000 0 0x8000000>;
- status = "disabled";
-
- #interrupt-cells = <1>;
- interrupt-map-mask = <0 0 0 7>;
- interrupt-map = <0 0 0 1 &pcie_intc0 0>,
- <0 0 0 2 &pcie_intc0 1>,
- <0 0 0 3 &pcie_intc0 2>,
- <0 0 0 4 &pcie_intc0 3>;
- pcie_intc0: interrupt-controller {
- interrupt-controller;
- #address-cells = <0>;
- #interrupt-cells = <1>;
- };
- };
-
- pcie1: pcie@1a145000 {
- compatible = "mediatek,mt7622-pcie";
- device_type = "pci";
- reg = <0 0x1a145000 0 0x1000>;
- reg-names = "port1";
- linux,pci-domain = <1>;
- #address-cells = <3>;
- #size-cells = <2>;
- interrupts = <GIC_SPI 229 IRQ_TYPE_LEVEL_LOW>;
- interrupt-names = "pcie_irq";
- clocks = <&pciesys CLK_PCIE_P1_MAC_EN>,
- /* designer has connect RC1 with p0_ahb clock */
- <&pciesys CLK_PCIE_P0_AHB_EN>,
- <&pciesys CLK_PCIE_P1_AUX_EN>,
- <&pciesys CLK_PCIE_P1_AXI_EN>,
- <&pciesys CLK_PCIE_P1_OBFF_EN>,
- <&pciesys CLK_PCIE_P1_PIPE_EN>;
- clock-names = "sys_ck1", "ahb_ck1", "aux_ck1",
- "axi_ck1", "obff_ck1", "pipe_ck1";
-
- power-domains = <&scpsys MT7622_POWER_DOMAIN_HIF0>;
- bus-range = <0x00 0xff>;
- ranges = <0x82000000 0 0x28000000 0x0 0x28000000 0 0x8000000>;
- status = "disabled";
-
- #interrupt-cells = <1>;
- interrupt-map-mask = <0 0 0 7>;
- interrupt-map = <0 0 0 1 &pcie_intc1 0>,
- <0 0 0 2 &pcie_intc1 1>,
- <0 0 0 3 &pcie_intc1 2>,
- <0 0 0 4 &pcie_intc1 3>;
- pcie_intc1: interrupt-controller {
- interrupt-controller;
- #address-cells = <0>;
- #interrupt-cells = <1>;
- };
- };
diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie.yaml b/Documentation/devicetree/bindings/pci/mediatek-pcie.yaml
new file mode 100644
index 000000000000..0b8c78ec4f91
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/mediatek-pcie.yaml
@@ -0,0 +1,438 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/mediatek-pcie.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: PCIe controller on MediaTek SoCs
+
+maintainers:
+ - Christian Marangi <ansuelsmth@gmail.com>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - airoha,an7583-pcie
+ - mediatek,mt2712-pcie
+ - mediatek,mt7622-pcie
+ - mediatek,mt7629-pcie
+ - items:
+ - const: airoha,en7523-pcie
+ - const: mediatek,mt7622-pcie
+
+ reg:
+ maxItems: 1
+
+ reg-names:
+ enum: [ port0, port1 ]
+
+ clocks:
+ minItems: 1
+ maxItems: 6
+
+ clock-names:
+ minItems: 1
+ items:
+ - enum: [ sys_ck0, sys_ck1 ]
+ - enum: [ ahb_ck0, ahb_ck1 ]
+ - enum: [ aux_ck0, aux_ck1 ]
+ - enum: [ axi_ck0, axi_ck1 ]
+ - enum: [ obff_ck0, obff_ck1 ]
+ - enum: [ pipe_ck0, pipe_ck1 ]
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ const: pcie-rst1
+
+ interrupts:
+ maxItems: 1
+
+ interrupt-names:
+ const: pcie_irq
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ enum: [ pcie-phy0, pcie-phy1 ]
+
+ power-domains:
+ maxItems: 1
+
+ mediatek,pbus-csr:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ - items:
+ - description: phandle to pbus-csr syscon
+ - description: offset of pbus-csr base address register
+ - description: offset of pbus-csr base address mask register
+ description:
+ Phandle with two arguments to the syscon node, used to detect
+ whether a given address is accessible through the PCIe controller.
+
+ '#interrupt-cells':
+ const: 1
+
+ interrupt-controller:
+ description: Interrupt controller node for handling legacy PCI interrupts.
+ type: object
+ properties:
+ '#address-cells':
+ const: 0
+ '#interrupt-cells':
+ const: 1
+ interrupt-controller: true
+
+ required:
+ - '#address-cells'
+ - '#interrupt-cells'
+ - interrupt-controller
+
+ additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - ranges
+ - clocks
+ - clock-names
+ - '#interrupt-cells'
+ - interrupts
+ - interrupt-names
+ - interrupt-controller
+
+allOf:
+ - $ref: /schemas/pci/pci-host-bridge.yaml#
+
+ - if:
+ properties:
+ compatible:
+ const: airoha,an7583-pcie
+ then:
+ properties:
+ reg-names:
+ const: port1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: sys_ck1
+
+ phy-names:
+ const: pcie-phy1
+
+ power-domains: false
+
+ required:
+ - resets
+ - reset-names
+ - phys
+ - phy-names
+ - mediatek,pbus-csr
+
+ - if:
+ properties:
+ compatible:
+ const: mediatek,mt2712-pcie
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+
+ clock-names:
+ minItems: 2
+ maxItems: 2
+
+ resets: false
+
+ reset-names: false
+
+ power-domains: false
+
+ mediatek,pbus-csr: false
+
+ required:
+ - phys
+ - phy-names
+
+ - if:
+ properties:
+ compatible:
+ const: mediatek,mt7622-pcie
+ then:
+ properties:
+ clocks:
+ minItems: 6
+
+ resets: false
+
+ reset-names: false
+
+ phys: false
+
+ phy-names: false
+
+ mediatek,pbus-csr: false
+
+ required:
+ - power-domains
+
+ - if:
+ properties:
+ compatible:
+ const: mediatek,mt7629-pcie
+ then:
+ properties:
+ clocks:
+ minItems: 6
+
+ resets: false
+
+ reset-names: false
+
+ mediatek,pbus-csr: false
+
+ required:
+ - power-domains
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: airoha,en7523-pcie
+ then:
+ properties:
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ maxItems: 1
+
+ resets: false
+
+ reset-names: false
+
+ phys: false
+
+ phy-names: false
+
+ power-domains: false
+
+ mediatek,pbus-csr: false
+
+unevaluatedProperties: false
+
+examples:
+ # MT2712
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/phy/phy.h>
+
+ soc_1 {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pcie@112ff000 {
+ compatible = "mediatek,mt2712-pcie";
+ device_type = "pci";
+ reg = <0 0x112ff000 0 0x1000>;
+ reg-names = "port1";
+ linux,pci-domain = <1>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ interrupts = <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "pcie_irq";
+ clocks = <&topckgen>, /* CLK_TOP_PE2_MAC_P1_SEL */
+ <&pericfg>; /* CLK_PERI_PCIE1 */
+ clock-names = "sys_ck1", "ahb_ck1";
+ phys = <&u3port1 PHY_TYPE_PCIE>;
+ phy-names = "pcie-phy1";
+ bus-range = <0x00 0xff>;
+ ranges = <0x82000000 0 0x11400000 0x0 0x11400000 0 0x300000>;
+
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 7>;
+ interrupt-map = <0 0 0 1 &pcie_intc1 0>,
+ <0 0 0 2 &pcie_intc1 1>,
+ <0 0 0 3 &pcie_intc1 2>,
+ <0 0 0 4 &pcie_intc1 3>;
+ pcie_intc1: interrupt-controller {
+ interrupt-controller;
+ #address-cells = <0>;
+ #interrupt-cells = <1>;
+ };
+ };
+
+ pcie@11700000 {
+ compatible = "mediatek,mt2712-pcie";
+ device_type = "pci";
+ reg = <0 0x11700000 0 0x1000>;
+ reg-names = "port0";
+ linux,pci-domain = <0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "pcie_irq";
+ clocks = <&topckgen>, /* CLK_TOP_PE2_MAC_P0_SEL */
+ <&pericfg>; /* CLK_PERI_PCIE0 */
+ clock-names = "sys_ck0", "ahb_ck0";
+ phys = <&u3port0 PHY_TYPE_PCIE>;
+ phy-names = "pcie-phy0";
+ bus-range = <0x00 0xff>;
+ ranges = <0x82000000 0 0x20000000 0x0 0x20000000 0 0x10000000>;
+
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 7>;
+ interrupt-map = <0 0 0 1 &pcie_intc0 0>,
+ <0 0 0 2 &pcie_intc0 1>,
+ <0 0 0 3 &pcie_intc0 2>,
+ <0 0 0 4 &pcie_intc0 3>;
+ pcie_intc0: interrupt-controller {
+ interrupt-controller;
+ #address-cells = <0>;
+ #interrupt-cells = <1>;
+ };
+ };
+ };
+
+ # MT7622
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/power/mt7622-power.h>
+
+ soc_2 {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pcie@1a143000 {
+ compatible = "mediatek,mt7622-pcie";
+ device_type = "pci";
+ reg = <0 0x1a143000 0 0x1000>;
+ reg-names = "port0";
+ linux,pci-domain = <0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ interrupts = <GIC_SPI 228 IRQ_TYPE_LEVEL_LOW>;
+ interrupt-names = "pcie_irq";
+ clocks = <&pciesys>, /* CLK_PCIE_P0_MAC_EN */
+ <&pciesys>, /* CLK_PCIE_P0_AHB_EN */
+ <&pciesys>, /* CLK_PCIE_P0_AUX_EN */
+ <&pciesys>, /* CLK_PCIE_P0_AXI_EN */
+ <&pciesys>, /* CLK_PCIE_P0_OBFF_EN */
+ <&pciesys>; /* CLK_PCIE_P0_PIPE_EN */
+ clock-names = "sys_ck0", "ahb_ck0", "aux_ck0",
+ "axi_ck0", "obff_ck0", "pipe_ck0";
+
+ power-domains = <&scpsys MT7622_POWER_DOMAIN_HIF0>;
+ bus-range = <0x00 0xff>;
+ ranges = <0x82000000 0 0x20000000 0x0 0x20000000 0 0x8000000>;
+
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 7>;
+ interrupt-map = <0 0 0 1 &pcie_intc0_1 0>,
+ <0 0 0 2 &pcie_intc0_1 1>,
+ <0 0 0 3 &pcie_intc0_1 2>,
+ <0 0 0 4 &pcie_intc0_1 3>;
+ pcie_intc0_1: interrupt-controller {
+ interrupt-controller;
+ #address-cells = <0>;
+ #interrupt-cells = <1>;
+ };
+ };
+
+ pcie@1a145000 {
+ compatible = "mediatek,mt7622-pcie";
+ device_type = "pci";
+ reg = <0 0x1a145000 0 0x1000>;
+ reg-names = "port1";
+ linux,pci-domain = <1>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ interrupts = <GIC_SPI 229 IRQ_TYPE_LEVEL_LOW>;
+ interrupt-names = "pcie_irq";
+ clocks = <&pciesys>, /* CLK_PCIE_P1_MAC_EN */
+ /* designer has connect RC1 with p0_ahb clock */
+ <&pciesys>, /* CLK_PCIE_P0_AHB_EN */
+ <&pciesys>, /* CLK_PCIE_P1_AUX_EN */
+ <&pciesys>, /* CLK_PCIE_P1_AXI_EN */
+ <&pciesys>, /* CLK_PCIE_P1_OBFF_EN */
+ <&pciesys>; /* CLK_PCIE_P1_PIPE_EN */
+ clock-names = "sys_ck1", "ahb_ck1", "aux_ck1",
+ "axi_ck1", "obff_ck1", "pipe_ck1";
+
+ power-domains = <&scpsys MT7622_POWER_DOMAIN_HIF0>;
+ bus-range = <0x00 0xff>;
+ ranges = <0x82000000 0 0x28000000 0x0 0x28000000 0 0x8000000>;
+
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 7>;
+ interrupt-map = <0 0 0 1 &pcie_intc1_1 0>,
+ <0 0 0 2 &pcie_intc1_1 1>,
+ <0 0 0 3 &pcie_intc1_1 2>,
+ <0 0 0 4 &pcie_intc1_1 3>;
+ pcie_intc1_1: interrupt-controller {
+ interrupt-controller;
+ #address-cells = <0>;
+ #interrupt-cells = <1>;
+ };
+ };
+ };
+
+ # AN7583
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/en7523-clk.h>
+
+ soc_3 {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pcie@1fa92000 {
+ compatible = "airoha,an7583-pcie";
+ device_type = "pci";
+ linux,pci-domain = <1>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+
+ reg = <0x0 0x1fa92000 0x0 0x1670>;
+ reg-names = "port1";
+
+ clocks = <&scuclk EN7523_CLK_PCIE>;
+ clock-names = "sys_ck1";
+
+ phys = <&pciephy>;
+ phy-names = "pcie-phy1";
+
+ ranges = <0x02000000 0 0x24000000 0x0 0x24000000 0 0x4000000>;
+
+ resets = <&scuclk>; /* AN7583_PCIE1_RST */
+ reset-names = "pcie-rst1";
+
+ mediatek,pbus-csr = <&pbus_csr 0x8 0xc>;
+
+ interrupts = <GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "pcie_irq";
+ bus-range = <0x00 0xff>;
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 7>;
+ interrupt-map = <0 0 0 1 &pcie_intc1_4 0>,
+ <0 0 0 2 &pcie_intc1_4 1>,
+ <0 0 0 3 &pcie_intc1_4 2>,
+ <0 0 0 4 &pcie_intc1_4 3>;
+
+ pcie_intc1_4: interrupt-controller {
+ interrupt-controller;
+ #address-cells = <0>;
+ #interrupt-cells = <1>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pci/nxp,s32g-pcie.yaml b/Documentation/devicetree/bindings/pci/nxp,s32g-pcie.yaml
new file mode 100644
index 000000000000..66a050028278
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/nxp,s32g-pcie.yaml
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/nxp,s32g-pcie.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP S32G2xxx/S32G3xxx PCIe Root Complex controller
+
+maintainers:
+ - Bogdan Hamciuc <bogdan.hamciuc@nxp.com>
+ - Ionut Vicovan <ionut.vicovan@nxp.com>
+
+description:
+ This PCIe controller is based on the Synopsys DesignWare PCIe IP.
+ The S32G SoC family has two PCIe controllers, which can be configured as
+ either Root Complex or Endpoint.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - nxp,s32g2-pcie
+ - items:
+ - const: nxp,s32g3-pcie
+ - const: nxp,s32g2-pcie
+
+ reg:
+ maxItems: 6
+
+ reg-names:
+ items:
+ - const: dbi
+ - const: dbi2
+ - const: atu
+ - const: dma
+ - const: ctrl
+ - const: config
+
+ interrupts:
+ minItems: 1
+ maxItems: 2
+
+ interrupt-names:
+ items:
+ - const: msi
+ - const: dma
+ minItems: 1
+
+ pcie@0:
+ description:
+ Describes the S32G Root Port.
+ type: object
+ $ref: /schemas/pci/pci-pci-bridge.yaml#
+
+ properties:
+ reg:
+ maxItems: 1
+
+ phys:
+ maxItems: 1
+
+ required:
+ - reg
+ - phys
+
+ unevaluatedProperties: false
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - interrupts
+ - interrupt-names
+ - ranges
+ - pcie@0
+
+allOf:
+ - $ref: /schemas/pci/snps,dw-pcie.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/phy/phy.h>
+
+ bus {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pcie@40400000 {
+ compatible = "nxp,s32g3-pcie", "nxp,s32g2-pcie";
+ reg = <0x00 0x40400000 0x0 0x00001000>, /* dbi registers */
+ <0x00 0x40420000 0x0 0x00001000>, /* dbi2 registers */
+ <0x00 0x40460000 0x0 0x00001000>, /* atu registers */
+ <0x00 0x40470000 0x0 0x00001000>, /* dma registers */
+ <0x00 0x40481000 0x0 0x000000f8>, /* ctrl registers */
+ <0x5f 0xffffe000 0x0 0x00002000>; /* config space */
+ reg-names = "dbi", "dbi2", "atu", "dma", "ctrl", "config";
+ dma-coherent;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ device_type = "pci";
+ ranges =
+ <0x01000000 0x0 0x00000000 0x5f 0xfffe0000 0x0 0x00010000>,
+ <0x02000000 0x0 0x00000000 0x58 0x00000000 0x0 0x80000000>,
+ <0x02000000 0x1 0x00000000 0x59 0x00000000 0x6 0xfffe0000>;
+
+ bus-range = <0x0 0xff>;
+ interrupts = <GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "msi", "dma";
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 0x7>;
+ interrupt-map = <0 0 0 1 &gic 0 0 GIC_SPI 128 IRQ_TYPE_LEVEL_HIGH>,
+ <0 0 0 2 &gic 0 0 GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>,
+ <0 0 0 3 &gic 0 0 GIC_SPI 130 IRQ_TYPE_LEVEL_HIGH>,
+ <0 0 0 4 &gic 0 0 GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>;
+
+ pcie@0 {
+ reg = <0x0 0x0 0x0 0x0 0x0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ ranges;
+
+ device_type = "pci";
+ phys = <&serdes0 PHY_TYPE_PCIE 0 0>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pci/pci-ep.yaml b/Documentation/devicetree/bindings/pci/pci-ep.yaml
index 1868a10d5b10..baeb583e0bcd 100644
--- a/Documentation/devicetree/bindings/pci/pci-ep.yaml
+++ b/Documentation/devicetree/bindings/pci/pci-ep.yaml
@@ -11,7 +11,7 @@ description: |
maintainers:
- Kishon Vijay Abraham I <kishon@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
properties:
$nodename:
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-common.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-common.yaml
index ab2509ec1c4b..77f8faf54737 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-common.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-common.yaml
@@ -8,7 +8,7 @@ title: Qualcomm PCI Express Root Complex Common Properties
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
properties:
reg:
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml
index ac3414203d38..bed9a40b186b 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Qualcomm PCIe Endpoint Controller
maintainers:
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sa8255p.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sa8255p.yaml
index bdddd4f499d1..1f2d098b8638 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sa8255p.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sa8255p.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SA8255p based firmware managed and ECAM compliant PCIe Root Comp
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SA8255p SoC PCIe root complex controller is based on the Synopsys
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sa8775p.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sa8775p.yaml
index 19afe2a03409..63630a814f28 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sa8775p.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sa8775p.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SA8775p PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SA8775p SoC PCIe root complex controller is based on the Synopsys
@@ -78,6 +78,9 @@ properties:
required:
- interconnects
- interconnect-names
+ - power-domains
+ - resets
+ - reset-names
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sc7280.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sc7280.yaml
index 4d0a91556603..1f942b3075f1 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sc7280.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sc7280.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SC7280 PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SC7280 SoC PCIe root complex controller is based on the Synopsys
@@ -76,6 +76,11 @@ properties:
items:
- const: pci
+required:
+ - power-domains
+ - resets
+ - reset-names
+
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sc8180x.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sc8180x.yaml
index 34a4d7b2c845..6a7c410c9fc3 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sc8180x.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sc8180x.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SC8180x PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SC8180x SoC PCIe root complex controller is based on the Synopsys
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sc8280xp.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sc8280xp.yaml
index 15ba2385eb73..bc0e71dc06a3 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sc8280xp.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sc8280xp.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SC8280XP PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SC8280XP SoC PCIe root complex controller is based on the Synopsys
@@ -61,6 +61,9 @@ properties:
required:
- interconnects
- interconnect-names
+ - power-domains
+ - resets
+ - reset-names
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8150.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8150.yaml
index 26b247a41785..6a5421e4f19d 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8150.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8150.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SM8150 PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SM8150 SoC PCIe root complex controller is based on the Synopsys
@@ -74,6 +74,11 @@ properties:
items:
- const: pci
+required:
+ - power-domains
+ - resets
+ - reset-names
+
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8250.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8250.yaml
index af4dae68d508..adbeaa8f2c13 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8250.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8250.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SM8250 PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SM8250 SoC PCIe root complex controller is based on the Synopsys
@@ -83,6 +83,11 @@ properties:
items:
- const: pci
+required:
+ - power-domains
+ - resets
+ - reset-names
+
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8350.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8350.yaml
index dde3079adbb3..5744d5e969fb 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8350.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8350.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SM8350 PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SM8350 SoC PCIe root complex controller is based on the Synopsys
@@ -73,6 +73,11 @@ properties:
items:
- const: pci
+required:
+ - power-domains
+ - resets
+ - reset-names
+
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8450.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8450.yaml
index 6e0a6d8f0ed0..28b8ffb74124 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8450.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8450.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SM8450 PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SM8450 SoC PCIe root complex controller is based on the Synopsys
@@ -77,6 +77,11 @@ properties:
items:
- const: pci
+required:
+ - power-domains
+ - resets
+ - reset-names
+
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8550.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8550.yaml
index 38b561e23c1f..3a94a9c1bb15 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-sm8550.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-sm8550.yaml
@@ -8,7 +8,7 @@ title: Qualcomm SM8550 PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm SM8550 SoC (and compatible) PCIe root complex controller is based on
@@ -20,6 +20,7 @@ properties:
- const: qcom,pcie-sm8550
- items:
- enum:
+ - qcom,kaanapali-pcie
- qcom,sar2130p-pcie
- qcom,pcie-sm8650
- qcom,pcie-sm8750
@@ -83,6 +84,11 @@ properties:
- const: pci # PCIe core reset
- const: link_down # PCIe link down reset
+required:
+ - power-domains
+ - resets
+ - reset-names
+
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-x1e80100.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-x1e80100.yaml
index 61581ffbfb24..62c674ca0cf7 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie-x1e80100.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie-x1e80100.yaml
@@ -8,7 +8,7 @@ title: Qualcomm X1E80100 PCI Express Root Complex
maintainers:
- Bjorn Andersson <andersson@kernel.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description:
Qualcomm X1E80100 SoC (and compatible) PCIe root complex controller is based on
@@ -73,6 +73,11 @@ properties:
- const: pci # PCIe core reset
- const: link_down # PCIe link down reset
+required:
+ - power-domains
+ - resets
+ - reset-names
+
allOf:
- $ref: qcom,pcie-common.yaml#
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml
index 0e1808105a81..c61930441be0 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml
@@ -8,7 +8,7 @@ title: Qualcomm PCI express root complex
maintainers:
- Bjorn Andersson <bjorn.andersson@linaro.org>
- - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ - Manivannan Sadhasivam <mani@kernel.org>
description: |
Qualcomm PCIe root complex controller is based on the Synopsys DesignWare
diff --git a/Documentation/devicetree/bindings/pci/renesas,r9a08g045-pcie.yaml b/Documentation/devicetree/bindings/pci/renesas,r9a08g045-pcie.yaml
new file mode 100644
index 000000000000..d668782546a2
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/renesas,r9a08g045-pcie.yaml
@@ -0,0 +1,249 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/renesas,r9a08g045-pcie.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas RZ/G3S PCIe host controller
+
+maintainers:
+ - Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
+
+description:
+ The Renesas RZ/G3S PCIe host controller complies with the PCIe Base
+ Specification 4.0 and supports up to 5 GT/s (Gen2).
+
+properties:
+ compatible:
+ const: renesas,r9a08g045-pcie # RZ/G3S
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ items:
+ - description: System error interrupt
+ - description: System error on correctable error interrupt
+ - description: System error on non-fatal error interrupt
+ - description: System error on fatal error interrupt
+ - description: AXI error interrupt
+ - description: INTA interrupt
+ - description: INTB interrupt
+ - description: INTC interrupt
+ - description: INTD interrupt
+ - description: MSI interrupt
+ - description: Link bandwidth interrupt
+ - description: PME interrupt
+ - description: DMA interrupt
+ - description: PCIe event interrupt
+ - description: Message interrupt
+ - description: All interrupts
+
+ interrupt-names:
+ items:
+ - const: serr
+ - const: serr_cor
+ - const: serr_nonfatal
+ - const: serr_fatal
+ - const: axi_err
+ - const: inta
+ - const: intb
+ - const: intc
+ - const: intd
+ - const: msi
+ - const: link_bandwidth
+ - const: pm_pme
+ - const: dma
+ - const: pcie_evt
+ - const: msg
+ - const: all
+
+ interrupt-controller: true
+
+ clocks:
+ items:
+ - description: System clock
+ - description: PM control clock
+
+ clock-names:
+ items:
+ - const: aclk
+ - const: pm
+
+ resets:
+ items:
+ - description: AXI2PCIe Bridge reset
+ - description: Data link layer/transaction layer reset
+ - description: Transaction layer (ACLK domain) reset
+ - description: Transaction layer (PCLK domain) reset
+ - description: Physical layer reset
+ - description: Configuration register reset
+ - description: Configuration register load reset
+
+ reset-names:
+ items:
+ - const: aresetn
+ - const: rst_b
+ - const: rst_gp_b
+ - const: rst_ps_b
+ - const: rst_rsm_b
+ - const: rst_cfg_b
+ - const: rst_load_b
+
+ power-domains:
+ maxItems: 1
+
+ dma-ranges:
+ description:
+ A single range for the inbound memory region.
+ maxItems: 1
+
+ renesas,sysc:
+ description: |
+ System controller registers control and monitor various PCIe
+ functionalities.
+
+ Control:
+ - transition to L1 state
+ - receiver termination settings
+ - RST_RSM_B signal
+
+ Monitor:
+ - clkl1pm clock request state
+ - power off information in L2 state
+ - errors (fatal, non-fatal, correctable)
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+patternProperties:
+ "^pcie@0,[0-0]$":
+ type: object
+ allOf:
+ - $ref: /schemas/pci/pci-pci-bridge.yaml#
+
+ properties:
+ reg:
+ maxItems: 1
+
+ vendor-id:
+ const: 0x1912
+
+ device-id:
+ const: 0x0033
+
+ clocks:
+ items:
+ - description: Reference clock
+
+ clock-names:
+ items:
+ - const: ref
+
+ required:
+ - device_type
+ - vendor-id
+ - device-id
+ - clocks
+ - clock-names
+
+ unevaluatedProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - interrupts
+ - interrupt-names
+ - interrupt-map
+ - interrupt-map-mask
+ - interrupt-controller
+ - power-domains
+ - "#address-cells"
+ - "#size-cells"
+ - "#interrupt-cells"
+ - renesas,sysc
+
+allOf:
+ - $ref: /schemas/pci/pci-host-bridge.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/r9a08g045-cpg.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ bus {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pcie@11e40000 {
+ compatible = "renesas,r9a08g045-pcie";
+ reg = <0 0x11e40000 0 0x10000>;
+ ranges = <0x02000000 0 0x30000000 0 0x30000000 0 0x08000000>;
+ /* Map all possible DRAM ranges (4 GB). */
+ dma-ranges = <0x42000000 0 0x40000000 0 0x40000000 1 0x00000000>;
+ bus-range = <0x0 0xff>;
+ interrupts = <GIC_SPI 395 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 396 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 397 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 398 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 399 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 400 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 401 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 402 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 403 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 404 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 405 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 406 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 407 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 408 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 409 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 410 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "serr", "serr_cor", "serr_nonfatal",
+ "serr_fatal", "axi_err", "inta",
+ "intb", "intc", "intd", "msi",
+ "link_bandwidth", "pm_pme", "dma",
+ "pcie_evt", "msg", "all";
+ #interrupt-cells = <1>;
+ interrupt-controller;
+ interrupt-map-mask = <0 0 0 7>;
+ interrupt-map = <0 0 0 1 &pcie 0 0 0 0>, /* INTA */
+ <0 0 0 2 &pcie 0 0 0 1>, /* INTB */
+ <0 0 0 3 &pcie 0 0 0 2>, /* INTC */
+ <0 0 0 4 &pcie 0 0 0 3>; /* INTD */
+ clocks = <&cpg CPG_MOD R9A08G045_PCI_ACLK>,
+ <&cpg CPG_MOD R9A08G045_PCI_CLKL1PM>;
+ clock-names = "aclk", "pm";
+ resets = <&cpg R9A08G045_PCI_ARESETN>,
+ <&cpg R9A08G045_PCI_RST_B>,
+ <&cpg R9A08G045_PCI_RST_GP_B>,
+ <&cpg R9A08G045_PCI_RST_PS_B>,
+ <&cpg R9A08G045_PCI_RST_RSM_B>,
+ <&cpg R9A08G045_PCI_RST_CFG_B>,
+ <&cpg R9A08G045_PCI_RST_LOAD_B>;
+ reset-names = "aresetn", "rst_b", "rst_gp_b", "rst_ps_b",
+ "rst_rsm_b", "rst_cfg_b", "rst_load_b";
+ power-domains = <&cpg>;
+ device_type = "pci";
+ #address-cells = <3>;
+ #size-cells = <2>;
+ renesas,sysc = <&sysc>;
+
+ pcie@0,0 {
+ reg = <0x0 0x0 0x0 0x0 0x0>;
+ ranges;
+ clocks = <&versa3 5>;
+ clock-names = "ref";
+ device_type = "pci";
+ vendor-id = <0x1912>;
+ device-id = <0x0033>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml
index def513d29d26..355c4a46bd31 100644
--- a/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml
+++ b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml
@@ -22,6 +22,7 @@ properties:
- const: rockchip,rk3568-pcie
- items:
- enum:
+ - rockchip,rk3528-pcie
- rockchip,rk3562-pcie
- rockchip,rk3576-pcie
- rockchip,rk3588-pcie
@@ -78,6 +79,7 @@ allOf:
compatible:
contains:
enum:
+ - rockchip,rk3528-pcie
- rockchip,rk3562-pcie
- rockchip,rk3576-pcie
then:
@@ -89,6 +91,7 @@ allOf:
compatible:
contains:
enum:
+ - rockchip,rk3528-pcie
- rockchip,rk3562-pcie
- rockchip,rk3576-pcie
then:
diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml
index 34594972d8db..6339a76499b2 100644
--- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml
+++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml
@@ -115,11 +115,11 @@ properties:
above for new bindings.
oneOf:
- description: See native 'dbi' clock for details
- enum: [ pcie, pcie_apb_sys, aclk_dbi, reg ]
+ enum: [ pcie, pcie_apb_sys, aclk_dbi, reg, port ]
- description: See native 'mstr/slv' clock for details
enum: [ pcie_bus, pcie_inbound_axi, pcie_aclk, aclk_mst, aclk_slv ]
- description: See native 'pipe' clock for details
- enum: [ pcie_phy, pcie_phy_ref, link ]
+ enum: [ pcie_phy, pcie_phy_ref, link, general ]
- description: See native 'aux' clock for details
enum: [ pcie_aux ]
- description: See native 'ref' clock for details.
@@ -176,7 +176,7 @@ properties:
- description: See native 'phy' reset for details
enum: [ pciephy, link ]
- description: See native 'pwr' reset for details
- enum: [ turnoff ]
+ enum: [ turnoff, port ]
phys:
description:
diff --git a/Documentation/devicetree/bindings/pci/spacemit,k1-pcie-host.yaml b/Documentation/devicetree/bindings/pci/spacemit,k1-pcie-host.yaml
new file mode 100644
index 000000000000..c4c00b5fcdc0
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/spacemit,k1-pcie-host.yaml
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/spacemit,k1-pcie-host.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SpacemiT K1 PCI Express Host Controller
+
+maintainers:
+ - Alex Elder <elder@riscstar.com>
+
+description: >
+ The SpacemiT K1 SoC PCIe host controller is based on the Synopsys DesignWare
+ PCIe IP. The controller uses the DesignWare built-in MSI interrupt
+ controller, and supports 256 MSIs.
+
+allOf:
+ - $ref: /schemas/pci/snps,dw-pcie.yaml#
+
+properties:
+ compatible:
+ const: spacemit,k1-pcie
+
+ reg:
+ items:
+ - description: DesignWare PCIe registers
+ - description: ATU address space
+ - description: PCIe configuration space
+ - description: Link control registers
+
+ reg-names:
+ items:
+ - const: dbi
+ - const: atu
+ - const: config
+ - const: link
+
+ clocks:
+ items:
+ - description: DWC PCIe Data Bus Interface (DBI) clock
+ - description: DWC PCIe application AXI-bus master interface clock
+ - description: DWC PCIe application AXI-bus slave interface clock
+
+ clock-names:
+ items:
+ - const: dbi
+ - const: mstr
+ - const: slv
+
+ resets:
+ items:
+ - description: DWC PCIe Data Bus Interface (DBI) reset
+ - description: DWC PCIe application AXI-bus master interface reset
+ - description: DWC PCIe application AXI-bus slave interface reset
+
+ reset-names:
+ items:
+ - const: dbi
+ - const: mstr
+ - const: slv
+
+ interrupts:
+ items:
+ - description: Interrupt used for MSIs
+
+ interrupt-names:
+ const: msi
+
+ spacemit,apmu:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ description:
+ A phandle that refers to the APMU system controller, whose regmap is
+ used to manage resets and link state, along with the offset of its
+ reset control register.
+ items:
+ - items:
+ - description: phandle to APMU system controller
+ - description: register offset
+
+patternProperties:
+ '^pcie@':
+ type: object
+ $ref: /schemas/pci/pci-pci-bridge.yaml#
+
+ properties:
+ phys:
+ maxItems: 1
+
+ vpcie3v3-supply:
+ description:
+ A phandle for the 3.3 V regulator supplying the PCIe slot.
+
+ required:
+ - phys
+ - vpcie3v3-supply
+
+ unevaluatedProperties: false
+
+required:
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - interrupts
+ - interrupt-names
+ - spacemit,apmu
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/spacemit,k1-syscon.h>
+ pcie@ca400000 {
+ device_type = "pci";
+ compatible = "spacemit,k1-pcie";
+ reg = <0xca400000 0x00001000>,
+ <0xca700000 0x0001ff24>,
+ <0x9f000000 0x00002000>,
+ <0xc0c20000 0x00001000>;
+ reg-names = "dbi",
+ "atu",
+ "config",
+ "link";
+ #address-cells = <3>;
+ #size-cells = <2>;
+ ranges = <0x01000000 0x0 0x00000000 0x9f002000 0x0 0x00100000>,
+ <0x02000000 0x0 0x90000000 0x90000000 0x0 0x0f000000>;
+ interrupts = <142>;
+ interrupt-names = "msi";
+ clocks = <&syscon_apmu CLK_PCIE1_DBI>,
+ <&syscon_apmu CLK_PCIE1_MASTER>,
+ <&syscon_apmu CLK_PCIE1_SLAVE>;
+ clock-names = "dbi",
+ "mstr",
+ "slv";
+ resets = <&syscon_apmu RESET_PCIE1_DBI>,
+ <&syscon_apmu RESET_PCIE1_MASTER>,
+ <&syscon_apmu RESET_PCIE1_SLAVE>;
+ reset-names = "dbi",
+ "mstr",
+ "slv";
+ pinctrl-names = "default";
+ pinctrl-0 = <&pcie1_3_cfg>;
+ spacemit,apmu = <&syscon_apmu 0x3d4>;
+
+ pcie@0 {
+ device_type = "pci";
+ compatible = "pciclass,0604";
+ reg = <0x0 0x0 0x0 0x0 0x0>;
+ bus-range = <0x01 0xff>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ ranges;
+ phys = <&pcie1_phy>;
+ vpcie3v3-supply = <&pcie_vcc_3v3>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pci/toshiba,tc9563.yaml b/Documentation/devicetree/bindings/pci/toshiba,tc9563.yaml
new file mode 100644
index 000000000000..fae466064780
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/toshiba,tc9563.yaml
@@ -0,0 +1,179 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/toshiba,tc9563.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Toshiba TC9563 PCIe switch
+
+maintainers:
+ - Krishna Chaitanya Chundru <krishna.chundru@oss.qualcomm.com>
+
+description: |
+ The Toshiba TC9563 PCIe switch has one upstream and three downstream ports.
+ The third downstream port has an integrated Ethernet MAC endpoint device.
+ The other two downstream ports are meant to connect to external devices.
+
+ The TC9563 PCIe switch can be configured through its I2C interface, before
+ the PCIe link is established, to change FTS, ASPM-related entry delays,
+ tx amplitude, etc. for better power efficiency and functionality.
+
+properties:
+ compatible:
+ enum:
+ - pci1179,0623
+
+ reg:
+ maxItems: 1
+
+ resx-gpios:
+ maxItems: 1
+ description:
+ GPIO controlling the RESX# pin.
+
+ vdd18-supply: true
+
+ vdd09-supply: true
+
+ vddc-supply: true
+
+ vddio1-supply: true
+
+ vddio2-supply: true
+
+ vddio18-supply: true
+
+ i2c-parent:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ description:
+ A phandle to the parent I2C controller node and the slave address of
+ the device, used to configure the TC9563 (FTS, tx amplitude, etc.).
+ items:
+ - description: Phandle to the I2C controller node
+ - description: I2C slave address
+
+patternProperties:
+ "^pcie@[1-3],0$":
+ description:
+ Child nodes describing the internal downstream ports of
+ the TC9563 switch.
+ type: object
+ allOf:
+ - $ref: "#/$defs/tc9563-node"
+ - $ref: /schemas/pci/pci-pci-bridge.yaml#
+ unevaluatedProperties: false
+
+$defs:
+ tc9563-node:
+ type: object
+
+ properties:
+ toshiba,tx-amplitude-microvolt:
+ description:
+ Change the Tx margin setting for lower power consumption.
+
+ toshiba,no-dfe-support:
+ type: boolean
+ description:
+ Disable DFE (Decision Feedback Equalizer), which mitigates
+ intersymbol interference and some reflections caused by
+ impedance mismatches.
+
+required:
+ - resx-gpios
+ - vdd18-supply
+ - vdd09-supply
+ - vddc-supply
+ - vddio1-supply
+ - vddio2-supply
+ - vddio18-supply
+ - i2c-parent
+
+allOf:
+ - $ref: "#/$defs/tc9563-node"
+ - $ref: /schemas/pci/pci-bus-common.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ pcie {
+ #address-cells = <3>;
+ #size-cells = <2>;
+
+ pcie@0 {
+ device_type = "pci";
+ reg = <0x0 0x0 0x0 0x0 0x0>;
+
+ #address-cells = <3>;
+ #size-cells = <2>;
+ ranges;
+ bus-range = <0x01 0xff>;
+
+ pcie@0,0 {
+ compatible = "pci1179,0623";
+
+ reg = <0x10000 0x0 0x0 0x0 0x0>;
+ device_type = "pci";
+ #address-cells = <3>;
+ #size-cells = <2>;
+ ranges;
+ bus-range = <0x02 0xff>;
+
+ i2c-parent = <&qup_i2c 0x77>;
+
+ vdd18-supply = <&vdd>;
+ vdd09-supply = <&vdd>;
+ vddc-supply = <&vdd>;
+ vddio1-supply = <&vdd>;
+ vddio2-supply = <&vdd>;
+ vddio18-supply = <&vdd>;
+
+ resx-gpios = <&gpio 1 GPIO_ACTIVE_LOW>;
+
+ pcie@1,0 {
+ compatible = "pciclass,0604";
+ reg = <0x20800 0x0 0x0 0x0 0x0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ device_type = "pci";
+ ranges;
+ bus-range = <0x03 0xff>;
+
+ toshiba,no-dfe-support;
+ };
+
+ pcie@2,0 {
+ compatible = "pciclass,0604";
+ reg = <0x21000 0x0 0x0 0x0 0x0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ device_type = "pci";
+ ranges;
+ bus-range = <0x04 0xff>;
+ };
+
+ pcie@3,0 {
+ compatible = "pciclass,0604";
+ reg = <0x21800 0x0 0x0 0x0 0x0>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ device_type = "pci";
+ ranges;
+ bus-range = <0x05 0xff>;
+
+ toshiba,tx-amplitude-microvolt = <10>;
+
+ ethernet@0,0 {
+ reg = <0x50000 0x0 0x0 0x0 0x0>;
+ };
+
+ ethernet@0,1 {
+ reg = <0x50100 0x0 0x0 0x0 0x0>;
+ };
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/power/supply/richtek,rt9756.yaml b/Documentation/devicetree/bindings/power/supply/richtek,rt9756.yaml
new file mode 100644
index 000000000000..a88bf6cd1927
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/supply/richtek,rt9756.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/power/supply/richtek,rt9756.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Richtek RT9756 Smart Cap Divider Charger
+
+maintainers:
+ - ChiYuan Huang <cy_huang@richtek.com>
+
+description: |
+ The RT9756/RT9757 is a high-efficiency, high-charge-current charger.
+
+ The efficiency is up to 98.2% at VBAT = 4 V, IBAT = 2 A in DIV2 mode, and
+ 99.1% at VBAT = 4 V, IBAT = 1 A in bypass mode. The maximum charge current
+ is up to 8 A in DIV2 mode and 5 A in bypass mode. The device integrates a
+ smart cap divider topology, a direct charging mode, external over-voltage
+ protection control, an input reverse-blocking NFET with 2-way regulation, a
+ dual-phase charge pump core, 8-channel high-speed ADCs and USB BC 1.2
+ detection.
+
+ The RT9770 is almost the same as the RT9756/57; only the BC 1.2 detection
+ function is removed to shrink the die size.
+
+allOf:
+ - $ref: power-supply.yaml#
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - richtek,rt9756
+ - richtek,rt9770
+ - items:
+ - enum:
+ - richtek,rt9757
+ - const: richtek,rt9756
+
+ reg:
+ maxItems: 1
+
+ wakeup-source: true
+
+ interrupts:
+ maxItems: 1
+
+ shunt-resistor-micro-ohms:
+ description: Resistance of the mounted battery current-sense resistor.
+ default: 2000
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ charger@6f {
+ compatible = "richtek,rt9756";
+ reg = <0x6f>;
+ wakeup-source;
+ interrupts-extended = <&gpio_intc 32 IRQ_TYPE_EDGE_FALLING>;
+ shunt-resistor-micro-ohms = <5000>;
+ };
+ };
diff --git a/Documentation/driver-api/cxl/allocation/page-allocator.rst b/Documentation/driver-api/cxl/allocation/page-allocator.rst
index 7b8fe1b8d5bb..3fa584a248bd 100644
--- a/Documentation/driver-api/cxl/allocation/page-allocator.rst
+++ b/Documentation/driver-api/cxl/allocation/page-allocator.rst
@@ -41,37 +41,6 @@ To simplify this, the page allocator will prefer :code:`ZONE_MOVABLE` over
will fallback to allocate from :code:`ZONE_NORMAL`.
-Zone and Node Quirks
-====================
-Let's consider a configuration where the local DRAM capacity is largely onlined
-into :code:`ZONE_NORMAL`, with no :code:`ZONE_MOVABLE` capacity present. The
-CXL capacity has the opposite configuration - all onlined in
-:code:`ZONE_MOVABLE`.
-
-Under the default allocation policy, the page allocator will completely skip
-:code:`ZONE_MOVABLE` as a valid allocation target. This is because, as of
-Linux v6.15, the page allocator does (approximately) the following: ::
-
- for (each zone in local_node):
-
- for (each node in fallback_order):
-
- attempt_allocation(gfp_flags);
-
-Because the local node does not have :code:`ZONE_MOVABLE`, the CXL node is
-functionally unreachable for direct allocation. As a result, the only way
-for CXL capacity to be used is via `demotion` in the reclaim path.
-
-This configuration also means that if the DRAM ndoe has :code:`ZONE_MOVABLE`
-capacity - when that capacity is depleted, the page allocator will actually
-prefer CXL :code:`ZONE_MOVABLE` pages over DRAM :code:`ZONE_NORMAL` pages.
-
-We may wish to invert this priority in future Linux versions.
-
-If `demotion` and `swap` are disabled, Linux will begin to cause OOM crashes
-when the DRAM nodes are depleted. See the reclaim section for more details.
-
-
CGroups and CPUSets
===================
Finally, assuming CXL memory is reachable via the page allocation (i.e. onlined
diff --git a/Documentation/driver-api/firmware/efi/index.rst b/Documentation/driver-api/firmware/efi/index.rst
index 4fe8abba9fc6..5a6b6229592c 100644
--- a/Documentation/driver-api/firmware/efi/index.rst
+++ b/Documentation/driver-api/firmware/efi/index.rst
@@ -1,11 +1,16 @@
.. SPDX-License-Identifier: GPL-2.0
-============
-UEFI Support
-============
+====================================================
+Unified Extensible Firmware Interface (UEFI) Support
+====================================================
UEFI stub library functions
===========================
.. kernel-doc:: drivers/firmware/efi/libstub/mem.c
:internal:
+
+UEFI Common Platform Error Record (CPER) functions
+==================================================
+
+.. kernel-doc:: drivers/firmware/efi/cper.c
diff --git a/Documentation/driver-api/generic_pt.rst b/Documentation/driver-api/generic_pt.rst
new file mode 100644
index 000000000000..fd29d1b525e5
--- /dev/null
+++ b/Documentation/driver-api/generic_pt.rst
@@ -0,0 +1,137 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+Generic Radix Page Table
+========================
+
+.. kernel-doc:: include/linux/generic_pt/common.h
+ :doc: Generic Radix Page Table
+
+.. kernel-doc:: drivers/iommu/generic_pt/pt_defs.h
+ :doc: Generic Page Table Language
+
+Usage
+=====
+
+Generic PT is structured as a multi-compilation system. Since each format
+provides an API using a common set of names, there can be only one format
+active within a compilation unit. This design avoids function pointers around
+the low-level API.
+
+Instead, the function pointers end up at the higher-level API (i.e.,
+map/unmap, etc.) and the per-format code can be directly inlined into the
+per-format compilation unit. For something like IOMMU, each format will be
+compiled into a per-format IOMMU operations kernel module.
+
+For this to work the .c file for each compilation unit will include both the
+format headers and the generic code for the implementation. For instance in an
+implementation compilation unit the headers would normally be included as
+follows:
+
+generic_pt/fmt/iommu_amdv1.c::
+
+ #include <linux/generic_pt/common.h>
+ #include "defs_amdv1.h"
+ #include "../pt_defs.h"
+ #include "amdv1.h"
+ #include "../pt_common.h"
+ #include "../pt_iter.h"
+ #include "../iommu_pt.h" /* The IOMMU implementation */
+
+iommu_pt.h includes definitions that will generate the operations functions for
+map/unmap/etc. using the definitions provided by AMDv1. The resulting module
+will have exported symbols named like pt_iommu_amdv1_init().
+
+Refer to drivers/iommu/generic_pt/fmt/iommu_template.h for an example of how the
+IOMMU implementation uses multi-compilation to generate per-format ops struct
+pointers.
+
+The format code is written so that the common names arise from #defines to
+distinct format-specific names. This is intended to aid debuggability by
+avoiding symbol clashes across all the different formats.
+
+Exported symbols and other global names are mangled using a per-format string
+via the NS() helper macro.
+
+The format uses struct pt_common as the top-level struct for the table,
+and each format will have its own struct pt_xxx which embeds it to store
+format-specific information.
+
+The implementation will further wrap struct pt_common in its own top-level
+struct, such as struct pt_iommu_amdv1.
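+
+As a rough sketch (the struct and field names below are illustrative, not
+the actual kernel definitions), the layering looks like::
+
+    /* Format-level struct embedding the generic state */
+    struct pt_amdv1 {
+        struct pt_common common;
+        /* format-specific fields */
+    };
+
+    /* Implementation-level wrapper around the format struct */
+    struct pt_iommu_amdv1 {
+        struct pt_iommu iommu;
+        struct pt_amdv1 amdpt;
+    };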
+
+Format functions at the struct pt_common level
+----------------------------------------------
+
+.. kernel-doc:: include/linux/generic_pt/common.h
+ :identifiers:
+.. kernel-doc:: drivers/iommu/generic_pt/pt_common.h
+
+Iteration Helpers
+-----------------
+
+.. kernel-doc:: drivers/iommu/generic_pt/pt_iter.h
+
+Writing a Format
+----------------
+
+It is best to start from a simple format that is similar to the target. x86_64
+is usually a good reference for something simple, and AMDv1 is something fairly
+complete.
+
+The required inline functions need to be implemented in the format header.
+These should all follow the standard pattern of::
+
+ static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
+ {
+ [..]
+ }
+ #define pt_entry_oa amdv1pt_entry_oa
+
+where a uniquely named per-format inline function provides the implementation
+and a define maps it to the generic name. This is intended to make debug
+symbols work better. Inline functions should always be used, as the prototypes
+in pt_common.h cause the compiler to validate the function signature and
+prevent errors.
+
+Review pt_fmt_defaults.h to understand some of the optional inlines.
+
+Once the format compiles, it should be run through the generic page table
+kunit test in kunit_generic_pt.h. For example::
+
+ $ tools/testing/kunit/kunit.py run --build_dir build_kunit_x86_64 --arch x86_64 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig amdv1_fmt_test.*
+ [...]
+ [11:15:08] Testing complete. Ran 9 tests: passed: 9
+ [11:15:09] Elapsed time: 3.137s total, 0.001s configuring, 2.368s building, 0.311s running
+
+The generic tests are intended to prove out the format functions and give
+clearer failures to speed up finding the problems. Once those pass, the
+entire kunit suite should be run.
+
+IOMMU Invalidation Features
+---------------------------
+
+Invalidation is how the page table algorithms synchronize with a HW cache of the
+page table memory, typically called the TLB (or IOTLB for IOMMU cases).
+
+The TLB can store present PTEs, non-present PTEs and table pointers, depending
+on its design. Every HW has its own approach to describing what has changed so
+that the changed items are removed from the TLB.
+
+PT_FEAT_FLUSH_RANGE
+~~~~~~~~~~~~~~~~~~~
+
+PT_FEAT_FLUSH_RANGE is the easiest scheme to understand. It tries to generate a
+single range invalidation for each operation, over-invalidating if there are
+gaps of VA that don't need invalidation. This trades off impacted VA for number
+of invalidation operations. It does not keep track of what is being invalidated;
+however, if pages have to be freed then page table pointers have to be cleaned
+from the walk cache. The range can start/end at any page boundary.
+
+PT_FEAT_FLUSH_RANGE_NO_GAPS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PT_FEAT_FLUSH_RANGE_NO_GAPS is similar to PT_FEAT_FLUSH_RANGE; however, it tries
+to minimize the amount of impacted VA by issuing extra flush operations. This is
+useful if the cost of processing VA is very high, for instance because a
+hypervisor is processing the page table with a shadowing algorithm.
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 3e2a270bd828..baff96b5cf0b 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -93,6 +93,7 @@ Subsystem-specific APIs
frame-buffer
aperture
generic-counter
+ generic_pt
gpio/index
hsi
hte/index
diff --git a/Documentation/driver-api/pci/p2pdma.rst b/Documentation/driver-api/pci/p2pdma.rst
index d0b241628cf1..280673b50350 100644
--- a/Documentation/driver-api/pci/p2pdma.rst
+++ b/Documentation/driver-api/pci/p2pdma.rst
@@ -9,22 +9,48 @@ between two devices on the bus. This type of transaction is henceforth
called Peer-to-Peer (or P2P). However, there are a number of issues that
make P2P transactions tricky to do in a perfectly safe way.
-One of the biggest issues is that PCI doesn't require forwarding
-transactions between hierarchy domains, and in PCIe, each Root Port
-defines a separate hierarchy domain. To make things worse, there is no
-simple way to determine if a given Root Complex supports this or not.
-(See PCIe r4.0, sec 1.3.1). Therefore, as of this writing, the kernel
-only supports doing P2P when the endpoints involved are all behind the
-same PCI bridge, as such devices are all in the same PCI hierarchy
-domain, and the spec guarantees that all transactions within the
-hierarchy will be routable, but it does not require routing
-between hierarchies.
-
-The second issue is that to make use of existing interfaces in Linux,
-memory that is used for P2P transactions needs to be backed by struct
-pages. However, PCI BARs are not typically cache coherent so there are
-a few corner case gotchas with these pages so developers need to
-be careful about what they do with them.
+For PCIe the routing of Transaction Layer Packets (TLPs) is well-defined up
+until they reach a host bridge or root port. If the path includes PCIe switches
+then based on the ACS settings the transaction can route entirely within
+the PCIe hierarchy and never reach the root port. The kernel will evaluate
+the PCIe topology and always permit P2P in these well-defined cases.
+
+However, if the P2P transaction reaches the host bridge then it might have to
+hairpin back out the same root port, be routed inside the CPU SoC to another
+PCIe root port, or be routed internally to the SoC.
+
+The PCIe specification doesn't define the forwarding of transactions between
+hierarchy domains, and the kernel defaults to blocking such routing. There is
+an allowlist for detecting known-good HW, in which case P2P between any
+two PCIe devices will be permitted.
+
+Since P2P inherently does transactions between two devices, it requires two
+drivers to co-operate inside the kernel. The providing driver has to convey
+its MMIO to the consuming driver. To meet the driver model lifecycle rules the
+MMIO must have all DMA mappings removed, all CPU accesses prevented, and all
+page table mappings undone before the providing driver completes remove().
+
+This requires the providing and consuming driver to actively work together to
+guarantee that the consuming driver has stopped using the MMIO during a removal
+cycle. This is done by either a synchronous invalidation shutdown or waiting
+for all usage refcounts to reach zero.
+
+At the lowest level the P2P subsystem offers a naked struct p2p_provider that
+delegates lifecycle management to the providing driver. It is expected that
+drivers using this option will wrap their MMIO memory in DMABUF and use DMABUF
+to provide an invalidation shutdown. These MMIO addresses have no struct page,
+and if used with mmap() must create special PTEs. As such there are very few
+kernel uAPIs that can accept pointers to them; in particular they cannot be used
+with read()/write(), including O_DIRECT.
+
+Building on this, the subsystem offers a layer to wrap the MMIO in a ZONE_DEVICE
+pgmap of MEMORY_DEVICE_PCI_P2PDMA to create struct pages. The lifecycle of
+pgmap ensures that when the pgmap is destroyed all other drivers have stopped
+using the MMIO. This option works with O_DIRECT flows, in some cases, if the
+underlying subsystem supports handling MEMORY_DEVICE_PCI_P2PDMA through
+FOLL_PCI_P2PDMA. The use of FOLL_LONGTERM is prevented. As this relies on
+pgmap, it also depends on architecture support and is subject to alignment
+and minimum-size limitations.
Driver Writer's Guide
@@ -114,14 +140,39 @@ allocating scatter-gather lists with P2P memory.
Struct Page Caveats
-------------------
-Driver writers should be very careful about not passing these special
-struct pages to code that isn't prepared for it. At this time, the kernel
-interfaces do not have any checks for ensuring this. This obviously
-precludes passing these pages to userspace.
+While the MEMORY_DEVICE_PCI_P2PDMA pages can be installed in VMAs,
+pin_user_pages() and related will not return them unless FOLL_PCI_P2PDMA is set.
-P2P memory is also technically IO memory but should never have any side
-effects behind it. Thus, the order of loads and stores should not be important
-and ioreadX(), iowriteX() and friends should not be necessary.
+Supporting MEMORY_DEVICE_PCI_P2PDMA pages in the kernel requires care. The
+KVA is still MMIO and must still be accessed through the normal
+readX()/writeX()/etc helpers. Direct CPU access (e.g. memcpy) is forbidden, just
+like any other MMIO mapping. While this will actually work on some
+architectures, others will experience corruption or just crash in the kernel.
+Supporting FOLL_PCI_P2PDMA in a subsystem requires scrubbing it to ensure no CPU
+access happens.
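+
+As a hedged illustration (kva, buf, len and REG_OFFSET stand in for driver
+context and are not a real API), access through such a mapping looks like::
+
+    void __iomem *kva;              /* KVA of the P2P page: still MMIO */
+    u32 val;
+
+    val = readl(kva + REG_OFFSET);  /* OK: MMIO accessor */
+    memcpy_fromio(buf, kva, len);   /* OK: MMIO-safe copy */
+    /* memcpy(buf, kva, len);         forbidden: direct CPU access */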
+
+
+Usage With DMABUF
+=================
+
+DMABUF provides an alternative to the above struct page-based
+client/provider/orchestrator system and should be used when struct page
+doesn't exist. In this mode the exporting driver will wrap
+some of its MMIO in a DMABUF and give the DMABUF FD to userspace.
+
+Userspace can then pass the FD to an importing driver, which will ask the
+exporting driver to map it to the importer.
+
+In this case the initiator and target pci_devices are known and the P2P subsystem
+is used to determine the mapping type. The phys_addr_t-based DMA API is used to
+establish the dma_addr_t.
+
+Lifecycle is controlled by DMABUF move_notify(). When the exporting driver
+wants to complete remove() it must deliver an invalidation shutdown to all
+DMABUF importing drivers through move_notify() and synchronously DMA unmap
+all the MMIO.
+
+No importing driver can continue to have a DMA map to the MMIO after the
+exporting driver has destroyed its p2p_provider.
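+
+As a minimal exporter-side teardown sketch (struct my_dev and
+my_p2p_provider_destroy() are hypothetical placeholders, not the API of any
+specific driver)::
+
+    static void my_exporter_remove(struct my_dev *mdev)
+    {
+        dma_resv_lock(mdev->dmabuf->resv, NULL);
+        /* Importers DMA unmap the MMIO in their move_notify() */
+        dma_buf_move_notify(mdev->dmabuf);
+        dma_resv_unlock(mdev->dmabuf->resv);
+
+        /* Only now is it safe to destroy the p2p_provider */
+        my_p2p_provider_destroy(mdev->provider);
+    }
+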
P2P DMA Support Library
diff --git a/Documentation/driver-api/pci/pci.rst b/Documentation/driver-api/pci/pci.rst
index 59d86e827198..99a1bbaaec5d 100644
--- a/Documentation/driver-api/pci/pci.rst
+++ b/Documentation/driver-api/pci/pci.rst
@@ -37,6 +37,9 @@ PCI Support Library
.. kernel-doc:: drivers/pci/slot.c
:export:
+.. kernel-doc:: drivers/pci/rebar.c
+ :export:
+
.. kernel-doc:: drivers/pci/rom.c
:export:
diff --git a/Documentation/power/power_supply_class.rst b/Documentation/power/power_supply_class.rst
index da8e275a14ff..6d11f8c594a0 100644
--- a/Documentation/power/power_supply_class.rst
+++ b/Documentation/power/power_supply_class.rst
@@ -7,35 +7,35 @@ Synopsis
Power supply class used to represent battery, UPS, AC or DC power supply
properties to user-space.
-It defines core set of attributes, which should be applicable to (almost)
+It defines a core set of attributes which should be applicable to (almost)
every power supply out there. Attributes are available via sysfs and uevent
interfaces.
-Each attribute has well defined meaning, up to unit of measure used. While
+Each attribute has a well-defined meaning, up to the unit of measure used. While
the attributes provided are believed to be universally applicable to any
power supply, specific monitoring hardware may not be able to provide them
all, so any of them may be skipped.
-Power supply class is extensible, and allows to define drivers own attributes.
-The core attribute set is subject to the standard Linux evolution (i.e.
-if it will be found that some attribute is applicable to many power supply
-types or their drivers, it can be added to the core set).
+The power supply class is extensible and allows drivers to define their own
+attributes. The core attribute set is subject to the standard Linux evolution
+(i.e., if some attribute is found to be applicable to many power
+supply types or their drivers, it can be added to the core set).
-It also integrates with LED framework, for the purpose of providing
+It also integrates with the LED framework, for the purpose of providing
typically expected feedback of battery charging/fully charged status and
AC/USB power supply online status. (Note that specific details of the
indication (including whether to use it at all) are fully controllable by
-user and/or specific machine defaults, per design principles of LED
-framework).
+user and/or specific machine defaults, per design principles of the LED
+framework.)
Attributes/properties
~~~~~~~~~~~~~~~~~~~~~
-Power supply class has predefined set of attributes, this eliminates code
-duplication across drivers. Power supply class insist on reusing its
+The power supply class has a predefined set of attributes. This eliminates code
+duplication across drivers. The power supply class insists on reusing its
predefined attributes *and* their units.
-So, userspace gets predictable set of attributes and their units for any
+So, userspace gets a predictable set of attributes and their units for any
kind of power supply, and can process/present them to a user in consistent
manner. Results for different power supplies and machines are also directly
comparable.
@@ -61,7 +61,7 @@ Attributes/properties detailed
| **Charge/Energy/Capacity - how to not confuse** |
+--------------------------------------------------------------------------+
| **Because both "charge" (µAh) and "energy" (µWh) represents "capacity" |
-| of battery, this class distinguish these terms. Don't mix them!** |
+| of battery, this class distinguishes these terms. Don't mix them!** |
| |
| - `CHARGE_*` |
| attributes represents capacity in µAh only. |
@@ -81,7 +81,7 @@ _NOW
STATUS
this attribute represents operating status (charging, full,
- discharging (i.e. powering a load), etc.). This corresponds to
+ discharging (i.e., powering a load), etc.). This corresponds to
`BATTERY_STATUS_*` values, as defined in battery.h.
CHARGE_TYPE
@@ -92,10 +92,10 @@ CHARGE_TYPE
AUTHENTIC
indicates the power supply (battery or charger) connected
- to the platform is authentic(1) or non authentic(0).
+ to the platform is authentic(1) or non-authentic(0).
HEALTH
- represents health of the battery, values corresponds to
+ represents health of the battery. Values correspond to
POWER_SUPPLY_HEALTH_*, defined in battery.h.
VOLTAGE_OCV
@@ -103,11 +103,11 @@ VOLTAGE_OCV
VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN
design values for maximal and minimal power supply voltages.
- Maximal/minimal means values of voltages when battery considered
+ Maximal/minimal mean the voltage values when the battery is considered
"full"/"empty" at normal conditions. Yes, there is no direct relation
between voltage and battery capacity, but some dumb
batteries use voltage for very approximated calculation of capacity.
- Battery driver also can use this attribute just to inform userspace
+ A battery driver can also use this attribute just to inform userspace
about maximal and minimal voltage thresholds of a given battery.
VOLTAGE_MAX, VOLTAGE_MIN
@@ -122,16 +122,16 @@ CURRENT_BOOT
Reports the current measured during boot
CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN
- design charge values, when battery considered full/empty.
+ design charge values, when battery is considered full/empty.
ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN
same as above but for energy.
CHARGE_FULL, CHARGE_EMPTY
- These attributes means "last remembered value of charge when battery
- became full/empty". It also could mean "value of charge when battery
+ These attributes mean "last remembered value of charge when battery
+ became full/empty". They also could mean "value of charge when battery is
considered full/empty at given conditions (temperature, age)".
- I.e. these attributes represents real thresholds, not design values.
+ I.e., these attributes represent real thresholds, not design values.
ENERGY_FULL, ENERGY_EMPTY
same as above but for energy.
@@ -153,12 +153,12 @@ CHARGE_TERM_CURRENT
CONSTANT_CHARGE_CURRENT
constant charge current programmed by charger.
-
CONSTANT_CHARGE_CURRENT_MAX
maximum charge current supported by the power supply object.
CONSTANT_CHARGE_VOLTAGE
constant charge voltage programmed by charger.
+
CONSTANT_CHARGE_VOLTAGE_MAX
maximum charge voltage supported by the power supply object.
@@ -208,10 +208,10 @@ TEMP_MAX
TIME_TO_EMPTY
seconds left for battery to be considered empty
- (i.e. while battery powers a load)
+ (i.e., while battery powers a load)
TIME_TO_FULL
seconds left for battery to be considered full
- (i.e. while battery is charging)
+ (i.e., while battery is charging)
Battery <-> external power supply interaction
@@ -220,13 +220,13 @@ Often power supplies are acting as supplies and supplicants at the same
time. Batteries are good example. So, batteries usually care if they're
externally powered or not.
-For that case, power supply class implements notification mechanism for
+For that case, the power supply class implements a notification mechanism for
batteries.
-External power supply (AC) lists supplicants (batteries) names in
+An external power supply (AC) lists the supplicant (battery) names in the
"supplied_to" struct member, and each power_supply_changed() call
-issued by external power supply will notify supplicants via
-external_power_changed callback.
+issued by an external power supply will notify supplicants via
+the external_power_changed callback.
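+
+A minimal sketch of the supplying (AC) side, with illustrative names::
+
+    static char *ac_supplied_to[] = { "main-battery" };
+
+    static const struct power_supply_config ac_cfg = {
+        .supplied_to = ac_supplied_to,
+        .num_supplicants = ARRAY_SIZE(ac_supplied_to),
+    };
+
+    /* ac_psy = power_supply_register(dev, &ac_desc, &ac_cfg); */
+
+    /* Each call notifies "main-battery" via external_power_changed: */
+    power_supply_changed(ac_psy);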
Devicetree battery characteristics
@@ -241,14 +241,14 @@ battery node have names corresponding to elements in enum power_supply_property,
for naming consistency between sysfs attributes and battery node properties.
-QA
-~~
+Q&A
+~~~
Q:
Where is POWER_SUPPLY_PROP_XYZ attribute?
A:
- If you cannot find attribute suitable for your driver needs, feel free
- to add it and send patch along with your driver.
+ If you cannot find an attribute suitable for your driver needs, feel free
+ to add it and send a patch along with your driver.
The attributes available currently are the ones currently provided by the
drivers written.
@@ -258,18 +258,18 @@ A:
Q:
- I have some very specific attribute (e.g. battery color), should I add
+ I have some very specific attribute (e.g., battery color). Should I add
this attribute to standard ones?
A:
Most likely, no. Such attribute can be placed in the driver itself, if
- it is useful. Of course, if the attribute in question applicable to
- large set of batteries, provided by many drivers, and/or comes from
+ it is useful. Of course, if the attribute in question is applicable to
+ a large set of batteries, provided by many drivers, and/or comes from
some general battery specification/standard, it may be a candidate to
be added to the core attribute set.
Q:
- Suppose, my battery monitoring chip/firmware does not provides capacity
+ Suppose my battery monitoring chip/firmware does not provide capacity
in percents, but provides charge_{now,full,empty}. Should I calculate
percentage capacity manually, inside the driver, and register CAPACITY
attribute? The same question about time_to_empty/time_to_full.
@@ -278,11 +278,11 @@ A:
directly measurable by the specific hardware available.
Inferring not available properties using some heuristics or mathematical
- model is not subject of work for a battery driver. Such functionality
+ model is not a subject of work for a battery driver. Such functionality
should be factored out, and in fact, apm_power, the driver to serve
- legacy APM API on top of power supply class, uses a simple heuristic of
+ legacy APM API on top of the power supply class, uses a simple heuristic of
approximating remaining battery capacity based on its charge, current,
- voltage and so on. But full-fledged battery model is likely not subject
- for kernel at all, as it would require floating point calculation to deal
- with things like differential equations and Kalman filters. This is
+ voltage and so on. But a full-fledged battery model is likely not a subject
+ for the kernel at all, as it would require floating point calculations to
+ deal with things like differential equations and Kalman filters. This is
better handled by batteryd/libbattery, yet to be written.
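For illustration, the flavor of integer heuristic apm_power applies (a rough
sketch of the approach, not its exact code)::

	/* estimate percentage and runtime from measurable values only */
	capacity = div_s64(charge_now * 100, charge_full);       /* percent */
	time_to_empty = div_s64(charge_now * 3600, current_now); /* seconds */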
diff --git a/MAINTAINERS b/MAINTAINERS
index 38274c51a2f7..3d295ce46ab7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -388,7 +388,7 @@ B: https://bugzilla.kernel.org
F: drivers/acpi/*thermal*
ACPI VIOT DRIVER
-M: Jean-Philippe Brucker <jean-philippe@linaro.org>
+M: Jean-Philippe Brucker <jpb@kernel.org>
L: linux-acpi@vger.kernel.org
L: iommu@lists.linux.dev
S: Maintained
@@ -2269,7 +2269,7 @@ F: drivers/iommu/arm/
F: drivers/iommu/io-pgtable-arm*
ARM SMMU SVA SUPPORT
-R: Jean-Philippe Brucker <jean-philippe@linaro.org>
+R: Jean-Philippe Brucker <jpb@kernel.org>
F: drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
ARM SUB-ARCHITECTURES
@@ -3164,6 +3164,15 @@ F: arch/arm64/boot/dts/freescale/s32g*.dts*
F: drivers/pinctrl/nxp/
F: drivers/rtc/rtc-s32g.c
+ARM/NXP S32G PCIE CONTROLLER DRIVER
+M: Ciprian Marian Costea <ciprianmarian.costea@oss.nxp.com>
+R: NXP S32 Linux Team <s32@nxp.com>
+L: imx@lists.linux.dev
+L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+S: Maintained
+F: Documentation/devicetree/bindings/pci/nxp,s32g-pcie.yaml
+F: drivers/pci/controller/dwc/pcie-nxp-s32g*
+
ARM/NXP S32G/S32R DWMAC ETHERNET DRIVER
M: Jan Petrous <jan.petrous@oss.nxp.com>
R: s32@nxp.com
@@ -5234,6 +5243,13 @@ W: http://www.broadcom.com
F: drivers/infiniband/hw/bnxt_re/
F: include/uapi/rdma/bnxt_re-abi.h
+BROADCOM 800 GIGABIT ROCE DRIVER
+M: Siva Reddy Kallam <siva.kallam@broadcom.com>
+L: linux-rdma@vger.kernel.org
+S: Supported
+W: http://www.broadcom.com
+F: drivers/infiniband/hw/bng_re/
+
BROADCOM NVRAM DRIVER
M: Rafał Miłecki <zajec5@gmail.com>
L: linux-mips@vger.kernel.org
@@ -19775,6 +19791,13 @@ S: Orphan
F: Documentation/devicetree/bindings/pci/cdns,*
F: drivers/pci/controller/cadence/*cadence*
+PCI DRIVER FOR CIX Sky1
+M: Hans Zhang <hans.zhang@cixtech.com>
+L: linux-pci@vger.kernel.org
+S: Maintained
+F: Documentation/devicetree/bindings/pci/cix,sky1-pcie-*.yaml
+F: drivers/pci/controller/cadence/*sky1*
+
PCI DRIVER FOR FREESCALE LAYERSCAPE
M: Minghuan Lian <minghuan.Lian@nxp.com>
M: Mingkai Hu <mingkai.hu@nxp.com>
@@ -20025,6 +20048,7 @@ F: include/linux/pci-p2pdma.h
PCI POWER CONTROL
M: Bartosz Golaszewski <brgl@kernel.org>
+M: Manivannan Sadhasivam <mani@kernel.org>
L: linux-pci@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
@@ -20161,6 +20185,14 @@ S: Maintained
F: drivers/pci/controller/dwc/pcie-qcom-common.c
F: drivers/pci/controller/dwc/pcie-qcom.c
+PCIE DRIVER FOR RENESAS RZ/G3S SERIES
+M: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
+L: linux-pci@vger.kernel.org
+L: linux-renesas-soc@vger.kernel.org
+S: Supported
+F: Documentation/devicetree/bindings/pci/renesas,r9a08g045-pcie.yaml
+F: drivers/pci/controller/pcie-rzg3s-host.c
+
PCIE DRIVER FOR ROCKCHIP
M: Shawn Lin <shawn.lin@rock-chips.com>
L: linux-pci@vger.kernel.org
@@ -22479,6 +22511,12 @@ L: linux-serial@vger.kernel.org
S: Odd Fixes
F: drivers/tty/serial/rp2.*
+ROHM BD71828 CHARGER
+M: Andreas Kemnade <andreas@kemnade.info>
+M: Matti Vaittinen <mazziesaccount@gmail.com>
+S: Maintained
+F: drivers/power/supply/bd71828-charger.c
+
ROHM BD79703 DAC
M: Matti Vaittinen <mazziesaccount@gmail.com>
S: Supported
@@ -27424,7 +27462,7 @@ F: drivers/virtio/virtio_input.c
F: include/uapi/linux/virtio_input.h
VIRTIO IOMMU DRIVER
-M: Jean-Philippe Brucker <jean-philippe@linaro.org>
+M: Jean-Philippe Brucker <jpb@kernel.org>
L: virtualization@lists.linux.dev
S: Maintained
F: drivers/iommu/virtio-iommu.c
diff --git a/arch/powerpc/include/asm/mem_encrypt.h b/arch/powerpc/include/asm/mem_encrypt.h
index 2f26b8fc8d29..e355ca46fad9 100644
--- a/arch/powerpc/include/asm/mem_encrypt.h
+++ b/arch/powerpc/include/asm/mem_encrypt.h
@@ -9,6 +9,9 @@
#define _ASM_POWERPC_MEM_ENCRYPT_H
#include <asm/svm.h>
+#include <linux/types.h>
+
+struct device;
static inline bool force_dma_unencrypted(struct device *dev)
{
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 244eb4857e7f..b7dcf07b2499 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1156,7 +1156,8 @@ EXPORT_SYMBOL_GPL(iommu_add_device);
*/
static int
spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct iommu_table_group *table_group;
@@ -1189,7 +1190,7 @@ static struct iommu_domain spapr_tce_platform_domain = {
static int
spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_group *grp = iommu_group_get(dev);
struct iommu_table_group *table_group;
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index e9108ccaf4b0..b6dbc9767596 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -84,7 +84,7 @@ static inline bool blk_can_dma_map_iova(struct request *req,
static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
{
- iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
+ iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr);
iter->len = vec->len;
return true;
}
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 97ee19f2cae0..56107aa00274 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -22,6 +22,7 @@
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/acpi.h>
+#include <linux/bitfield.h>
#include <linux/io.h>
#include <linux/interrupt.h>
#include <linux/timer.h>
@@ -552,26 +553,25 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
}
static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
- int sev, bool sync)
+ int sev, bool sync)
{
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
int flags = sync ? MF_ACTION_REQUIRED : 0;
+ char error_type[120];
bool queued = false;
int sec_sev, i;
char *p;
- log_arm_hw_error(err);
-
sec_sev = ghes_severity(gdata->error_severity);
+ log_arm_hw_error(err, sec_sev);
if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
return false;
p = (char *)(err + 1);
for (i = 0; i < err->err_info_num; i++) {
struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
- bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
+ bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
- const char *error_type = "unknown error";
/*
* The field (err_info->error_info & BIT(26)) is fixed to set to
@@ -585,12 +585,15 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
continue;
}
- if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
- error_type = cper_proc_error_type_strs[err_info->type];
+ cper_bits_to_str(error_type, sizeof(error_type),
+ FIELD_GET(CPER_ARM_ERR_TYPE_MASK, err_info->type),
+ cper_proc_error_type_strs,
+ ARRAY_SIZE(cper_proc_error_type_strs));
pr_warn_ratelimited(FW_WARN GHES_PFX
- "Unhandled processor error type: %s\n",
- error_type);
+ "Unhandled processor error type 0x%02x: %s%s\n",
+ err_info->type, error_type,
+ (err_info->type & ~CPER_ARM_ERR_TYPE_MASK) ? " with reserved bit(s)" : "");
p += err_info->length;
}
@@ -895,11 +898,9 @@ static void ghes_do_proc(struct ghes *ghes,
arch_apei_report_mem_error(sev, mem_err);
queued = ghes_handle_memory_failure(gdata, sev, sync);
- }
- else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
+ } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
ghes_handle_aer(gdata);
- }
- else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
+ } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
queued = ghes_handle_arm_hw_error(gdata, sev, sync);
} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 11e4483685c9..77a81627aaef 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -910,12 +910,13 @@ static void hmat_register_target(struct memory_target *target)
* Register generic port perf numbers. The nid may not be
* initialized and is still NUMA_NO_NODE.
*/
- mutex_lock(&target_lock);
- if (*(u16 *)target->gen_port_device_handle) {
- hmat_update_generic_target(target);
- target->registered = true;
+ scoped_guard(mutex, &target_lock) {
+ if (*(u16 *)target->gen_port_device_handle) {
+ hmat_update_generic_target(target);
+ target->registered = true;
+ return;
+ }
}
- mutex_unlock(&target_lock);
hmat_hotplug_target(target);
}
diff --git a/drivers/amba/Kconfig b/drivers/amba/Kconfig
index fb6c7e0b4cce..14bb61ff801e 100644
--- a/drivers/amba/Kconfig
+++ b/drivers/amba/Kconfig
@@ -5,7 +5,7 @@ config ARM_AMBA
if ARM_AMBA
config TEGRA_AHB
- bool
+ bool "Enable AHB driver for NVIDIA Tegra SoCs" if COMPILE_TEST
default y if ARCH_TEGRA
help
Adds AHB configuration functionality for NVIDIA Tegra SoCs,
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index be25ecbdba69..f8bfff5dd0bd 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3032,11 +3032,36 @@ static void qm_put_pci_res(struct hisi_qm *qm)
pci_release_mem_regions(pdev);
}
+static void hisi_mig_region_clear(struct hisi_qm *qm)
+{
+ u32 val;
+
+ /* Clear migration region set of PF */
+ if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) {
+ val = readl(qm->io_base + QM_MIG_REGION_SEL);
+ val &= ~QM_MIG_REGION_EN;
+ writel(val, qm->io_base + QM_MIG_REGION_SEL);
+ }
+}
+
+static void hisi_mig_region_enable(struct hisi_qm *qm)
+{
+ u32 val;
+
+ /* Select migration region of PF */
+ if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) {
+ val = readl(qm->io_base + QM_MIG_REGION_SEL);
+ val |= QM_MIG_REGION_EN;
+ writel(val, qm->io_base + QM_MIG_REGION_SEL);
+ }
+}
+
static void hisi_qm_pci_uninit(struct hisi_qm *qm)
{
struct pci_dev *pdev = qm->pdev;
pci_free_irq_vectors(pdev);
+ hisi_mig_region_clear(qm);
qm_put_pci_res(qm);
pci_disable_device(pdev);
}
@@ -5752,6 +5777,7 @@ int hisi_qm_init(struct hisi_qm *qm)
goto err_free_qm_memory;
qm_cmd_init(qm);
+ hisi_mig_region_enable(qm);
return 0;
@@ -5890,6 +5916,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm)
}
qm_cmd_init(qm);
+ hisi_mig_region_enable(qm);
hisi_qm_dev_err_init(qm);
/* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */
writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_aer.c b/drivers/crypto/intel/qat/qat_common/adf_aer.c
index 667d5e320f50..11728cf32653 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_aer.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_aer.c
@@ -105,7 +105,6 @@ void adf_dev_restore(struct adf_accel_dev *accel_dev)
accel_dev->accel_id);
hw_device->reset_device(accel_dev);
pci_restore_state(pdev);
- pci_save_state(pdev);
}
}
@@ -204,7 +203,6 @@ static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev)
if (!pdev->is_busmaster)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
res = adf_dev_up(accel_dev, false);
if (res && res != -EALREADY)
return PCI_ERS_RESULT_DISCONNECT;
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index bd2e282ca93a..77ac940e3013 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -11,25 +11,36 @@
#include "cxlpci.h"
#include "cxl.h"
-struct cxl_cxims_data {
- int nr_maps;
- u64 xormaps[] __counted_by(nr_maps);
-};
-
static const guid_t acpi_cxl_qtg_id_guid =
GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071,
0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52);
-static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr)
+#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1)
+static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = {
+ [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4
+};
+
+static const int valid_hbiw[] = { 1, 2, 3, 4, 6, 8, 12, 16 };
+
+u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw)
{
- struct cxl_cxims_data *cximsd = cxlrd->platform_data;
- int hbiw = cxlrd->cxlsd.nr_targets;
+ int nr_maps_to_apply = -1;
u64 val;
int pos;
- /* No xormaps for host bridge interleave ways of 1 or 3 */
- if (hbiw == 1 || hbiw == 3)
- return addr;
+ /*
+	 * Strictly validate hbiw since this function is also exercised by test
+	 * code, which nullifies any expectation of trusted parameters from the
+	 * CXL Region Driver.
+ */
+ for (int i = 0; i < ARRAY_SIZE(valid_hbiw); i++) {
+ if (valid_hbiw[i] == hbiw) {
+ nr_maps_to_apply = hbiw_to_nr_maps[hbiw];
+ break;
+ }
+ }
+ if (nr_maps_to_apply == -1 || nr_maps_to_apply > cximsd->nr_maps)
+ return ULLONG_MAX;
/*
* In regions using XOR interleave arithmetic the CXL HPA may not
@@ -60,6 +71,14 @@ static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr)
return addr;
}
+EXPORT_SYMBOL_FOR_MODULES(cxl_do_xormap_calc, "cxl_translate");
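The hbiw_to_nr_maps table above encodes that the number of xormaps to apply is
ilog2() of the power-of-2 factor of the host bridge interleave ways; a sketch
of that relationship (illustrative only, not part of the patch):

	static int nr_maps_for_hbiw(int hbiw)
	{
		/* 3-, 6- and 12-way decodes carry a modulo-3 factor that
		 * needs no xormap */
		if (hbiw % 3 == 0)
			hbiw /= 3;
		return ilog2(hbiw);	/* 1->0, 2->1, 4->2, 8->3, 16->4 */
	}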
+
+static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr)
+{
+ struct cxl_cxims_data *cximsd = cxlrd->platform_data;
+
+ return cxl_do_xormap_calc(cximsd, addr, cxlrd->cxlsd.nr_targets);
+}
struct cxl_cxims_context {
struct device *dev;
@@ -353,7 +372,7 @@ static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd)
rc = hmat_get_extended_linear_cache_size(&res, nid, &cache_size);
if (rc)
- return rc;
+ return 0;
/*
* The cache range is expected to be within the CFMWS.
@@ -378,21 +397,18 @@ static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd)
int rc;
rc = cxl_acpi_set_cache_size(cxlrd);
- if (!rc)
- return;
-
- if (rc != -EOPNOTSUPP) {
+ if (rc) {
/*
- * Failing to support extended linear cache region resize does not
+		 * Failing to retrieve the extended linear cache size does not
* prevent the region from functioning. Only causes cxl list showing
* incorrect region size.
*/
dev_warn(cxlrd->cxlsd.cxld.dev.parent,
- "Extended linear cache calculation failed rc:%d\n", rc);
- }
+ "Extended linear cache retrieval failed rc:%d\n", rc);
- /* Ignoring return code */
- cxlrd->cache_size = 0;
+ /* Ignoring return code */
+ cxlrd->cache_size = 0;
+ }
}
DEFINE_FREE(put_cxlrd, struct cxl_root_decoder *,
@@ -453,8 +469,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
ig = CXL_DECODER_MIN_GRANULARITY;
cxld->interleave_granularity = ig;
- cxl_setup_extended_linear_cache(cxlrd);
-
if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) {
if (ways != 1 && ways != 3) {
cxims_ctx = (struct cxl_cxims_context) {
@@ -470,18 +484,13 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
return -EINVAL;
}
}
+ cxlrd->ops.hpa_to_spa = cxl_apply_xor_maps;
+ cxlrd->ops.spa_to_hpa = cxl_apply_xor_maps;
}
- cxlrd->qos_class = cfmws->qtg_id;
-
- if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) {
- cxlrd->ops = kzalloc(sizeof(*cxlrd->ops), GFP_KERNEL);
- if (!cxlrd->ops)
- return -ENOMEM;
+ cxl_setup_extended_linear_cache(cxlrd);
- cxlrd->ops->hpa_to_spa = cxl_apply_xor_maps;
- cxlrd->ops->spa_to_hpa = cxl_apply_xor_maps;
- }
+ cxlrd->qos_class = cfmws->qtg_id;
rc = cxl_decoder_add(cxld);
if (rc)
diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index c4bd6e8a0cf0..7120b5f2e31f 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -826,7 +826,7 @@ static struct xarray *cxl_switch_gather_bandwidth(struct cxl_region *cxlr,
cxl_coordinates_combine(coords, coords, ctx->coord);
/*
- * Take the min of the calculated bandwdith and the upstream
+ * Take the min of the calculated bandwidth and the upstream
* switch SSLBIS bandwidth if there's a parent switch
*/
if (!is_root)
@@ -949,7 +949,7 @@ static struct xarray *cxl_hb_gather_bandwidth(struct xarray *xa)
/**
* cxl_region_update_bandwidth - Update the bandwidth access coordinates of a region
* @cxlr: The region being operated on
- * @input_xa: xarray holds cxl_perf_ctx wht calculated bandwidth per ACPI0017 instance
+ * @input_xa: xarray holds cxl_perf_ctx with calculated bandwidth per ACPI0017 instance
*/
static void cxl_region_update_bandwidth(struct cxl_region *cxlr,
struct xarray *input_xa)
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index d3a094ca01ad..1c5d2022c87a 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -905,6 +905,9 @@ static void cxl_decoder_reset(struct cxl_decoder *cxld)
if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
return;
+ if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags))
+ return;
+
if (port->commit_end == id)
cxl_port_commit_reap(cxld);
else
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 18825e1505d6..5b023a0178a4 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -71,85 +71,6 @@ struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port,
}
EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL");
-struct cxl_walk_context {
- struct pci_bus *bus;
- struct cxl_port *port;
- int type;
- int error;
- int count;
-};
-
-static int match_add_dports(struct pci_dev *pdev, void *data)
-{
- struct cxl_walk_context *ctx = data;
- struct cxl_port *port = ctx->port;
- int type = pci_pcie_type(pdev);
- struct cxl_register_map map;
- struct cxl_dport *dport;
- u32 lnkcap, port_num;
- int rc;
-
- if (pdev->bus != ctx->bus)
- return 0;
- if (!pci_is_pcie(pdev))
- return 0;
- if (type != ctx->type)
- return 0;
- if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP,
- &lnkcap))
- return 0;
-
- rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map);
- if (rc)
- dev_dbg(&port->dev, "failed to find component registers\n");
-
- port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap);
- dport = devm_cxl_add_dport(port, &pdev->dev, port_num, map.resource);
- if (IS_ERR(dport)) {
- ctx->error = PTR_ERR(dport);
- return PTR_ERR(dport);
- }
- ctx->count++;
-
- return 0;
-}
-
-/**
- * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port
- * @port: cxl_port whose ->uport_dev is the upstream of dports to be enumerated
- *
- * Returns a positive number of dports enumerated or a negative error
- * code.
- */
-int devm_cxl_port_enumerate_dports(struct cxl_port *port)
-{
- struct pci_bus *bus = cxl_port_to_pci_bus(port);
- struct cxl_walk_context ctx;
- int type;
-
- if (!bus)
- return -ENXIO;
-
- if (pci_is_root_bus(bus))
- type = PCI_EXP_TYPE_ROOT_PORT;
- else
- type = PCI_EXP_TYPE_DOWNSTREAM;
-
- ctx = (struct cxl_walk_context) {
- .port = port,
- .bus = bus,
- .type = type,
- };
- pci_walk_bus(bus, match_add_dports, &ctx);
-
- if (ctx.count == 0)
- return -ENODEV;
- if (ctx.error)
- return ctx.error;
- return ctx.count;
-}
-EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, "CXL");
-
static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id)
{
struct pci_dev *pdev = to_pci_dev(cxlds->dev);
@@ -1217,6 +1138,14 @@ int cxl_gpf_port_setup(struct cxl_dport *dport)
return 0;
}
+struct cxl_walk_context {
+ struct pci_bus *bus;
+ struct cxl_port *port;
+ int type;
+ int error;
+ int count;
+};
+
static int count_dports(struct pci_dev *pdev, void *data)
{
struct cxl_walk_context *ctx = data;
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 8128fd2b5b31..fef3aa0c6680 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -459,7 +459,6 @@ static void cxl_root_decoder_release(struct device *dev)
if (atomic_read(&cxlrd->region_id) >= 0)
memregion_free(atomic_read(&cxlrd->region_id));
__cxl_decoder_release(&cxlrd->cxlsd.cxld);
- kfree(cxlrd->ops);
kfree(cxlrd);
}
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 41b64d871c5a..82d229c8f9bf 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -245,6 +245,9 @@ static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
struct cxl_region_params *p = &cxlr->params;
int i;
+ if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
+ return;
+
/*
* Before region teardown attempt to flush, evict any data cached for
* this region, or scream loudly about missing arch / platform support
@@ -419,6 +422,9 @@ static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
return len;
}
+ if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
+ return -EPERM;
+
rc = queue_reset(cxlr);
if (rc)
return rc;
@@ -461,21 +467,6 @@ static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(commit);
-static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
- int n)
-{
- struct device *dev = kobj_to_dev(kobj);
- struct cxl_region *cxlr = to_cxl_region(dev);
-
- /*
- * Support tooling that expects to find a 'uuid' attribute for all
- * regions regardless of mode.
- */
- if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
- return 0444;
- return a->mode;
-}
-
static ssize_t interleave_ways_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -754,6 +745,21 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(size);
+static ssize_t extended_linear_cache_size_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ ssize_t rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+ return sysfs_emit(buf, "%#llx\n", p->cache_size);
+}
+static DEVICE_ATTR_RO(extended_linear_cache_size);
+
static struct attribute *cxl_region_attrs[] = {
&dev_attr_uuid.attr,
&dev_attr_commit.attr,
@@ -762,9 +768,34 @@ static struct attribute *cxl_region_attrs[] = {
&dev_attr_resource.attr,
&dev_attr_size.attr,
&dev_attr_mode.attr,
+ &dev_attr_extended_linear_cache_size.attr,
NULL,
};
+static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_region *cxlr = to_cxl_region(dev);
+
+ /*
+ * Support tooling that expects to find a 'uuid' attribute for all
+ * regions regardless of mode.
+ */
+ if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
+ return 0444;
+
+ /*
+ * Don't display extended linear cache attribute if there is no
+ * extended linear cache.
+ */
+ if (a == &dev_attr_extended_linear_cache_size.attr &&
+ cxlr->params.cache_size == 0)
+ return 0;
+
+ return a->mode;
+}
+
static const struct attribute_group cxl_region_group = {
.attrs = cxl_region_attrs,
.is_visible = cxl_region_visible,
@@ -838,16 +869,16 @@ static int match_free_decoder(struct device *dev, const void *data)
return 1;
}
-static bool region_res_match_cxl_range(const struct cxl_region_params *p,
- const struct range *range)
+static bool spa_maps_hpa(const struct cxl_region_params *p,
+ const struct range *range)
{
if (!p->res)
return false;
/*
- * If an extended linear cache region then the CXL range is assumed
- * to be fronted by the DRAM range in current known implementation.
- * This assumption will be made until a variant implementation exists.
+	 * The extended linear cache region is constructed with a 1:1 ratio
+ * where the SPA maps equal amounts of DRAM and CXL HPA capacity with
+ * CXL decoders at the high end of the SPA range.
*/
return p->res->start + p->cache_size == range->start &&
p->res->end == range->end;
@@ -865,7 +896,7 @@ static int match_auto_decoder(struct device *dev, const void *data)
cxld = to_cxl_decoder(dev);
r = &cxld->hpa_range;
- if (region_res_match_cxl_range(p, r))
+ if (spa_maps_hpa(p, r))
return 1;
return 0;
@@ -1059,6 +1090,16 @@ static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr,
return 0;
}
+static void cxl_region_set_lock(struct cxl_region *cxlr,
+ struct cxl_decoder *cxld)
+{
+ if (!test_bit(CXL_DECODER_F_LOCK, &cxld->flags))
+ return;
+
+ set_bit(CXL_REGION_F_LOCK, &cxlr->flags);
+ clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+}
+
/**
* cxl_port_attach_region() - track a region's interest in a port by endpoint
* @port: port to add a new region reference 'struct cxl_region_ref'
@@ -1170,6 +1211,8 @@ static int cxl_port_attach_region(struct cxl_port *port,
}
}
+ cxl_region_set_lock(cxlr, cxld);
+
rc = cxl_rr_ep_add(cxl_rr, cxled);
if (rc) {
dev_dbg(&cxlr->dev,
@@ -1328,7 +1371,7 @@ static int cxl_port_setup_targets(struct cxl_port *port,
struct cxl_endpoint_decoder *cxled)
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
- int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
+ int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos;
struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
@@ -1465,7 +1508,7 @@ static int cxl_port_setup_targets(struct cxl_port *port,
if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
if (cxld->interleave_ways != iw ||
(iw > 1 && cxld->interleave_granularity != ig) ||
- !region_res_match_cxl_range(p, &cxld->hpa_range) ||
+ !spa_maps_hpa(p, &cxld->hpa_range) ||
((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
dev_err(&cxlr->dev,
"%s:%s %s expected iw: %d ig: %d %pr\n",
@@ -1520,9 +1563,8 @@ add_target:
cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id;
}
- inc = 1;
+ cxl_rr->nr_targets_set++;
out_target_set:
- cxl_rr->nr_targets_set += inc;
dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n",
dev_name(port->uport_dev), dev_name(&port->dev),
cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev),
@@ -2439,6 +2481,7 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
dev->bus = &cxl_bus_type;
dev->type = &cxl_region_type;
cxlr->id = id;
+ cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld);
return cxlr;
}
@@ -2924,38 +2967,119 @@ static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos)
return false;
}
-static bool has_hpa_to_spa(struct cxl_root_decoder *cxlrd)
+#define CXL_POS_ZERO 0
+/**
+ * cxl_validate_translation_params - validate interleave translation parameters
+ * @eiw: encoded interleave ways
+ * @eig: encoded interleave granularity
+ * @pos: position in interleave
+ *
+ * Callers pass CXL_POS_ZERO when no position parameter needs validating.
+ *
+ * Returns: 0 on success, -EINVAL on first invalid parameter
+ */
+int cxl_validate_translation_params(u8 eiw, u16 eig, int pos)
{
- return cxlrd->ops && cxlrd->ops->hpa_to_spa;
+ int ways, gran;
+
+ if (eiw_to_ways(eiw, &ways)) {
+ pr_debug("%s: invalid eiw=%u\n", __func__, eiw);
+ return -EINVAL;
+ }
+ if (eig_to_granularity(eig, &gran)) {
+ pr_debug("%s: invalid eig=%u\n", __func__, eig);
+ return -EINVAL;
+ }
+ if (pos < 0 || pos >= ways) {
+ pr_debug("%s: invalid pos=%d for ways=%u\n", __func__, pos,
+ ways);
+ return -EINVAL;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate");
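For example (values assumed for illustration; eiw = 2 encodes 4 interleave
ways), position 3 validates while position 4 is rejected:

	/* illustrative check, not part of the patch */
	WARN_ON(cxl_validate_translation_params(2, 0, 3) != 0);
	WARN_ON(cxl_validate_translation_params(2, 0, 4) != -EINVAL);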
-static bool has_spa_to_hpa(struct cxl_root_decoder *cxlrd)
+u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig)
{
- return cxlrd->ops && cxlrd->ops->spa_to_hpa;
+ u64 dpa_offset, bits_lower, bits_upper, temp;
+ int ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
+ if (ret)
+ return ULLONG_MAX;
+
+ /*
+ * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13
+ * Lower bits [IG+7:0] pass through unchanged
+ * (eiw < 8)
+ * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW)
+ * Clear the position bits to isolate upper section, then
+ * reverse the left shift by eiw that occurred during DPA->HPA
+ * (eiw >= 8)
+ * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3
+ * Extract upper bits from the correct bit range and divide by 3
+ * to recover the original DPA upper bits
+ */
+ bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0);
+ if (eiw < 8) {
+		temp = hpa_offset & ~GENMASK_ULL(eig + eiw + 8 - 1, 0);
+ dpa_offset = temp >> eiw;
+ } else {
+ bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3);
+ dpa_offset = bits_upper << (eig + 8);
+ }
+ dpa_offset |= bits_lower;
+
+ return dpa_offset;
}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate");
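A worked example of the eiw < 8 branch (values assumed for illustration): with
a 2-way interleave (eiw = 1) and 256-byte granularity (eig = 0), HPA offset
0x300 is interleave chunk 3, the second chunk routed to position 1, so it
lands at DPA offset 0x100:

	u64 dpa = cxl_calculate_dpa_offset(0x300, 1, 0);
	/* bits_lower = 0x300 & GENMASK_ULL(7, 0) = 0,
	 * temp = 0x300 & ~GENMASK_ULL(8, 0) = 0x200,
	 * dpa = (0x200 >> 1) | 0 = 0x100 */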
-u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
- u64 dpa)
+int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig)
{
- struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
- u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa;
- struct cxl_region_params *p = &cxlr->params;
- struct cxl_endpoint_decoder *cxled = NULL;
- u16 eig = 0;
- u8 eiw = 0;
- int pos;
+ unsigned int ways = 0;
+ u64 shifted, rem;
+ int pos, ret;
- for (int i = 0; i < p->nr_targets; i++) {
- cxled = p->targets[i];
- if (cxlmd == cxled_to_memdev(cxled))
- break;
+ ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
+ if (ret)
+ return ret;
+
+ if (!eiw)
+ /* position is 0 if no interleaving */
+ return 0;
+
+ /*
+ * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13
+ * eiw < 8
+ * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8].
+ * Per spec "remove IW bits starting with bit position IG+8"
+ * eiw >= 8
+ * Position is not explicitly stored in HPA_OFFSET bits. It is
+ * derived from the modulo operation of the upper bits using
+ * the total number of interleave ways.
+ */
+ if (eiw < 8) {
+ pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0);
+ } else {
+ shifted = hpa_offset >> (eig + 8);
+ eiw_to_ways(eiw, &ways);
+ div64_u64_rem(shifted, ways, &rem);
+ pos = rem;
}
- if (!cxled || cxlmd != cxled_to_memdev(cxled))
- return ULLONG_MAX;
- pos = cxled->pos;
- ways_to_eiw(p->interleave_ways, &eiw);
- granularity_to_eig(p->interleave_granularity, &eig);
+ return pos;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate");
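Continuing the same assumed parameters, the position helper recovers the
interleave position from the bits sitting just above the granularity field:

	int pos = cxl_calculate_position(0x300, 1, 0);
	/* (0x300 >> (0 + 8)) & GENMASK(0, 0) == 1, i.e. position 1 */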
+
+u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig)
+{
+ u64 mask_upper, hpa_offset, bits_upper;
+ int ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, pos);
+ if (ret)
+ return ULLONG_MAX;
/*
* The device position in the region interleave set was removed
@@ -2967,9 +3091,6 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
* 8.2.4.19.13 Implementation Note: Device Decode Logic
*/
- /* Remove the dpa base */
- dpa_offset = dpa - cxl_dpa_resource_start(cxled);
-
mask_upper = GENMASK_ULL(51, eig + 8);
if (eiw < 8) {
@@ -2984,12 +3105,43 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
/* The lower bits remain unchanged */
hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
+ return hpa_offset;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate");
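And the reverse direction round-trips (same assumed values): re-inserting
position 1 into DPA offset 0x100 restores HPA offset 0x300:

	u64 hpa = cxl_calculate_hpa_offset(0x100, 1, 1, 0);
	/* upper DPA bits shifted left by eiw: 0x100 << 1 = 0x200,
	 * position inserted at bit eig + 8: | (1 << 8) -> 0x300 */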
+
+u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
+ u64 dpa)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_endpoint_decoder *cxled = NULL;
+ u64 dpa_offset, hpa_offset, hpa;
+ u16 eig = 0;
+ u8 eiw = 0;
+ int pos;
+
+ for (int i = 0; i < p->nr_targets; i++) {
+ if (cxlmd == cxled_to_memdev(p->targets[i])) {
+ cxled = p->targets[i];
+ break;
+ }
+ }
+ if (!cxled)
+ return ULLONG_MAX;
+
+ pos = cxled->pos;
+ ways_to_eiw(p->interleave_ways, &eiw);
+ granularity_to_eig(p->interleave_granularity, &eig);
+
+ dpa_offset = dpa - cxl_dpa_resource_start(cxled);
+ hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig);
+
/* Apply the hpa_offset to the region base address */
hpa = hpa_offset + p->res->start + p->cache_size;
/* Root decoder translation overrides typical modulo decode */
- if (has_hpa_to_spa(cxlrd))
- hpa = cxlrd->ops->hpa_to_spa(cxlrd, hpa);
+ if (cxlrd->ops.hpa_to_spa)
+ hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa);
if (!cxl_resource_contains_addr(p->res, hpa)) {
dev_dbg(&cxlr->dev,
@@ -2998,7 +3150,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
}
/* Simple chunk check, by pos & gran, only applies to modulo decodes */
- if (!has_hpa_to_spa(cxlrd) && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos)))
+ if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos))
return ULLONG_MAX;
return hpa;
@@ -3016,8 +3168,6 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
struct cxl_endpoint_decoder *cxled;
u64 hpa, hpa_offset, dpa_offset;
- u64 bits_upper, bits_lower;
- u64 shifted, rem, temp;
u16 eig = 0;
u8 eiw = 0;
int pos;
@@ -3033,56 +3183,21 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
* If the root decoder has SPA to CXL HPA callback, use it. Otherwise
* CXL HPA is assumed to equal SPA.
*/
- if (has_spa_to_hpa(cxlrd)) {
- hpa = cxlrd->ops->spa_to_hpa(cxlrd, p->res->start + offset);
+ if (cxlrd->ops.spa_to_hpa) {
+ hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset);
hpa_offset = hpa - p->res->start;
} else {
hpa_offset = offset;
}
- /*
- * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13
- * eiw < 8
- * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8].
- * Per spec "remove IW bits starting with bit position IG+8"
- * eiw >= 8
- * Position is not explicitly stored in HPA_OFFSET bits. It is
- * derived from the modulo operation of the upper bits using
- * the total number of interleave ways.
- */
- if (eiw < 8) {
- pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0);
- } else {
- shifted = hpa_offset >> (eig + 8);
- div64_u64_rem(shifted, p->interleave_ways, &rem);
- pos = rem;
- }
+
+ pos = cxl_calculate_position(hpa_offset, eiw, eig);
if (pos < 0 || pos >= p->nr_targets) {
dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n",
pos, p->nr_targets);
return -ENXIO;
}
- /*
- * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13
- * Lower bits [IG+7:0] pass through unchanged
- * (eiw < 8)
- * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW)
- * Clear the position bits to isolate upper section, then
- * reverse the left shift by eiw that occurred during DPA->HPA
- * (eiw >= 8)
- * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3
- * Extract upper bits from the correct bit range and divide by 3
- * to recover the original DPA upper bits
- */
- bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0);
- if (eiw < 8) {
- temp = hpa_offset &= ~((u64)GENMASK(eig + eiw + 8 - 1, 0));
- dpa_offset = temp >> eiw;
- } else {
- bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3);
- dpa_offset = bits_upper << (eig + 8);
- }
- dpa_offset |= bits_lower;
+ dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig);
/* Look-up and return the result: a memdev and a DPA */
for (int i = 0; i < p->nr_targets; i++) {
@@ -3398,7 +3513,7 @@ static int match_region_by_range(struct device *dev, const void *data)
p = &cxlr->params;
guard(rwsem_read)(&cxl_rwsem.region);
- return region_res_match_cxl_range(p, r);
+ return spa_maps_hpa(p, r);
}
static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
@@ -3479,6 +3594,10 @@ static int __construct_region(struct cxl_region *cxlr,
"Extended linear cache calculation failed rc:%d\n", rc);
}
+ rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group);
+ if (rc)
+ return rc;
+
rc = insert_resource(cxlrd->res, res);
if (rc) {
/*
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 231ddccf8977..ba17fa86d249 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -451,7 +451,7 @@ struct cxl_root_decoder {
void *platform_data;
struct mutex range_lock;
int qos_class;
- struct cxl_rd_ops *ops;
+ struct cxl_rd_ops ops;
struct cxl_switch_decoder cxlsd;
};
@@ -517,6 +517,14 @@ enum cxl_partition_mode {
*/
#define CXL_REGION_F_NEEDS_RESET 1
+/*
+ * Indicate whether this region is locked because one or more of its decoders
+ * are locked. The locked attribute is treated as all-or-nothing for the
+ * region. CXL_REGION_F_NEEDS_RESET should not be set if this flag is set.
+ */
+#define CXL_REGION_F_LOCK 2
+
/**
* struct cxl_region - CXL region
* @dev: This region's device
@@ -738,6 +746,25 @@ static inline bool is_cxl_root(struct cxl_port *port)
return port->uport_dev == port->dev.parent;
}
+/* Address translation functions exported to cxl_translate test module only */
+int cxl_validate_translation_params(u8 eiw, u16 eig, int pos);
+u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig);
+u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig);
+int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig);
+struct cxl_cxims_data {
+ int nr_maps;
+ u64 xormaps[] __counted_by(nr_maps);
+};
+
+#if IS_ENABLED(CONFIG_CXL_ACPI)
+u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw);
+#else
+static inline u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw)
+{
+ return ULLONG_MAX;
+}
+#endif
+
int cxl_num_decoders_committed(struct cxl_port *port);
bool is_cxl_port(const struct device *dev);
struct cxl_port *to_cxl_port(const struct device *dev);
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 7ae621e618e7..1d526bea8431 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -127,7 +127,6 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev)
return lnksta2 & PCI_EXP_LNKSTA2_FLIT;
}
-int devm_cxl_port_enumerate_dports(struct cxl_port *port);
struct cxl_dev_state;
void read_cdat_data(struct cxl_port *port);
void cxl_cor_error_detected(struct pci_dev *pdev);
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index bd100ac31672..0be4e508affe 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -136,7 +136,7 @@ static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
if (opcode == CXL_MBOX_OP_SANITIZE) {
mutex_lock(&cxl_mbox->mbox_mutex);
if (mds->security.sanitize_node)
- mod_delayed_work(system_wq, &mds->security.poll_dwork, 0);
+ mod_delayed_work(system_percpu_wq, &mds->security.poll_dwork, 0);
mutex_unlock(&cxl_mbox->mbox_mutex);
} else {
/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile
index 70ec901edf2c..2008fb7481b3 100644
--- a/drivers/dma-buf/Makefile
+++ b/drivers/dma-buf/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \
- dma-fence-unwrap.o dma-resv.o
+ dma-fence-unwrap.o dma-resv.o dma-buf-mapping.o
obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o
obj-$(CONFIG_DMABUF_HEAPS) += heaps/
obj-$(CONFIG_SYNC_FILE) += sync_file.o
diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c
new file mode 100644
index 000000000000..b7352e609fbd
--- /dev/null
+++ b/drivers/dma-buf/dma-buf-mapping.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * DMA BUF Mapping Helpers
+ *
+ */
+#include <linux/dma-buf-mapping.h>
+#include <linux/dma-resv.h>
+
+static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length,
+ dma_addr_t addr)
+{
+ unsigned int len, nents;
+ int i;
+
+ nents = DIV_ROUND_UP(length, UINT_MAX);
+ for (i = 0; i < nents; i++) {
+ len = min_t(size_t, length, UINT_MAX);
+ length -= len;
+ /*
+ * DMABUF abuses scatterlist to create a scatterlist
+ * that does not have any CPU list, only the DMA list.
+ * Always set the page related values to NULL to ensure
+ * importers can't use it. The phys_addr based DMA API
+ * does not require the CPU list for mapping or unmapping.
+ */
+ sg_set_page(sgl, NULL, 0, 0);
+ sg_dma_address(sgl) = addr + (dma_addr_t)i * UINT_MAX;
+ sg_dma_len(sgl) = len;
+ sgl = sg_next(sgl);
+ }
+
+ return sgl;
+}
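For example (numbers assumed), a 6 GiB contiguous DMA run cannot fit a single
entry because sg_dma_len() is a u32, so it is split across two:

	/* nents = DIV_ROUND_UP(6 GiB, UINT_MAX) == 2:
	 * entry 0: addr,            len = UINT_MAX
	 * entry 1: addr + UINT_MAX, len = 6 GiB - UINT_MAX */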
+
+static unsigned int calc_sg_nents(struct dma_iova_state *state,
+ struct dma_buf_phys_vec *phys_vec,
+ size_t nr_ranges, size_t size)
+{
+ unsigned int nents = 0;
+ size_t i;
+
+ if (!state || !dma_use_iova(state)) {
+ for (i = 0; i < nr_ranges; i++)
+ nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
+ } else {
+ /*
+		 * In the IOVA case a single SG entry spans the whole IOVA
+		 * range, but each entry's length is limited by the u32
+		 * sg->length, so more than one entry may be needed.
+ */
+ nents = DIV_ROUND_UP(size, UINT_MAX);
+ }
+
+ return nents;
+}
+
+/**
+ * struct dma_buf_dma - holds DMA mapping information
+ * @sgt: Scatter-gather table
+ * @state: DMA IOVA state relevant in IOMMU-based DMA
+ * @size: Total size of DMA transfer
+ */
+struct dma_buf_dma {
+ struct sg_table sgt;
+ struct dma_iova_state *state;
+ size_t size;
+};
+
+/**
+ * dma_buf_phys_vec_to_sgt - Returns the scatterlist table of the attachment
+ * from arrays of physical vectors. This funciton is intended for MMIO memory
+ * only.
+ * @attach: [in] attachment whose scatterlist is to be returned
+ * @provider: [in] p2pdma provider
+ * @phys_vec: [in] array of physical vectors
+ * @nr_ranges: [in] number of entries in phys_vec array
+ * @size: [in] total size of phys_vec
+ * @dir: [in] direction of DMA transfer
+ *
+ * Returns an sg_table containing the mapped scatterlist; returns ERR_PTR
+ * on error. May return -EINTR if it is interrupted by a signal.
+ *
+ * On success, the DMA addresses and lengths in the returned scatterlist are
+ * PAGE_SIZE aligned.
+ *
+ * A mapping must be unmapped by using dma_buf_free_sgt().
+ *
+ * NOTE: This function is intended for exporters. If direct traffic routing is
+ * mandatory, the exporter should call pci_p2pdma_map_type() to establish the
+ * routing before calling this function.
+ */
+struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach,
+ struct p2pdma_provider *provider,
+ struct dma_buf_phys_vec *phys_vec,
+ size_t nr_ranges, size_t size,
+ enum dma_data_direction dir)
+{
+ unsigned int nents, mapped_len = 0;
+ struct dma_buf_dma *dma;
+ struct scatterlist *sgl;
+ dma_addr_t addr;
+ size_t i;
+ int ret;
+
+	if (WARN_ON(!attach || !attach->dmabuf || !provider))
+		/* This function is supposed to work on MMIO memory only */
+		return ERR_PTR(-EINVAL);
+
+	dma_resv_assert_held(attach->dmabuf->resv);
+
+ dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+ if (!dma)
+ return ERR_PTR(-ENOMEM);
+
+ switch (pci_p2pdma_map_type(provider, attach->dev)) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ /*
+		 * There is no need for an IOVA at all in this flow.
+ */
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ dma->state = kzalloc(sizeof(*dma->state), GFP_KERNEL);
+ if (!dma->state) {
+ ret = -ENOMEM;
+ goto err_free_dma;
+ }
+
+ dma_iova_try_alloc(attach->dev, dma->state, 0, size);
+ break;
+ default:
+ ret = -EINVAL;
+ goto err_free_dma;
+ }
+
+ nents = calc_sg_nents(dma->state, phys_vec, nr_ranges, size);
+ ret = sg_alloc_table(&dma->sgt, nents, GFP_KERNEL | __GFP_ZERO);
+ if (ret)
+ goto err_free_state;
+
+ sgl = dma->sgt.sgl;
+
+ for (i = 0; i < nr_ranges; i++) {
+ if (!dma->state) {
+ addr = pci_p2pdma_bus_addr_map(provider,
+ phys_vec[i].paddr);
+ } else if (dma_use_iova(dma->state)) {
+ ret = dma_iova_link(attach->dev, dma->state,
+ phys_vec[i].paddr, 0,
+ phys_vec[i].len, dir,
+ DMA_ATTR_MMIO);
+ if (ret)
+ goto err_unmap_dma;
+
+ mapped_len += phys_vec[i].len;
+ } else {
+ addr = dma_map_phys(attach->dev, phys_vec[i].paddr,
+ phys_vec[i].len, dir,
+ DMA_ATTR_MMIO);
+ ret = dma_mapping_error(attach->dev, addr);
+ if (ret)
+ goto err_unmap_dma;
+ }
+
+ if (!dma->state || !dma_use_iova(dma->state))
+ sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
+ }
+
+ if (dma->state && dma_use_iova(dma->state)) {
+ WARN_ON_ONCE(mapped_len != size);
+ ret = dma_iova_sync(attach->dev, dma->state, 0, mapped_len);
+ if (ret)
+ goto err_unmap_dma;
+
+ sgl = fill_sg_entry(sgl, mapped_len, dma->state->addr);
+ }
+
+ dma->size = size;
+
+ /*
+	 * No CPU list is included; set orig_nents = 0 so importers can detect
+	 * this via the SG table (use nents only).
+	 */
+	dma->sgt.orig_nents = 0;
+
+ /*
+	 * sgl must be NULL at this point, which confirms that sg_alloc_table()
+	 * allocated exactly the right number of entries.
+ */
+ WARN_ON_ONCE(sgl);
+ return &dma->sgt;
+
+err_unmap_dma:
+ if (!i || !dma->state) {
+ ; /* Do nothing */
+ } else if (dma_use_iova(dma->state)) {
+ dma_iova_destroy(attach->dev, dma->state, mapped_len, dir,
+ DMA_ATTR_MMIO);
+ } else {
+ for_each_sgtable_dma_sg(&dma->sgt, sgl, i)
+ dma_unmap_phys(attach->dev, sg_dma_address(sgl),
+ sg_dma_len(sgl), dir, DMA_ATTR_MMIO);
+ }
+ sg_free_table(&dma->sgt);
+err_free_state:
+ kfree(dma->state);
+err_free_dma:
+ kfree(dma);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_phys_vec_to_sgt, "DMA_BUF");
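An exporter-side usage sketch (the my_exporter type and its fields are
hypothetical; only the dma_buf_phys_vec_to_sgt() call reflects this patch):

	static struct sg_table *my_map_dma_buf(struct dma_buf_attachment *attach,
					       enum dma_data_direction dir)
	{
		struct my_exporter *exp = attach->dmabuf->priv;
		struct dma_buf_phys_vec vec = {
			.paddr = exp->mmio_phys,
			.len = exp->mmio_len,
		};

		return dma_buf_phys_vec_to_sgt(attach, exp->provider, &vec, 1,
					       exp->mmio_len, dir);
	}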
+
+/**
+ * dma_buf_free_sgt - unmaps the buffer
+ * @attach: [in] attachment to unmap buffer from
+ * @sgt: [in] scatterlist info of the buffer to unmap
+ * @dir: [in] direction of DMA transfer
+ *
+ * This unmaps a DMA mapping for @attach obtained
+ * by dma_buf_phys_vec_to_sgt().
+ */
+void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+ struct dma_buf_dma *dma = container_of(sgt, struct dma_buf_dma, sgt);
+ int i;
+
+ dma_resv_assert_held(attach->dmabuf->resv);
+
+ if (!dma->state) {
+ ; /* Do nothing */
+ } else if (dma_use_iova(dma->state)) {
+ dma_iova_destroy(attach->dev, dma->state, dma->size, dir,
+ DMA_ATTR_MMIO);
+ } else {
+ struct scatterlist *sgl;
+
+ for_each_sgtable_dma_sg(sgt, sgl, i)
+ dma_unmap_phys(attach->dev, sg_dma_address(sgl),
+ sg_dma_len(sgl), dir, DMA_ATTR_MMIO);
+ }
+
+ sg_free_table(sgt);
+ kfree(dma->state);
+ kfree(dma);
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_free_sgt, "DMA_BUF");
diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c
index 02f68b328511..227398673b73 100644
--- a/drivers/dma/ioat/init.c
+++ b/drivers/dma/ioat/init.c
@@ -1286,7 +1286,6 @@ static pci_ers_result_t ioat_pcie_error_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
}
diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c
index f0a63d09d3c4..76542a53e202 100644
--- a/drivers/firmware/efi/cper-arm.c
+++ b/drivers/firmware/efi/cper-arm.c
@@ -93,15 +93,11 @@ static void cper_print_arm_err_info(const char *pfx, u32 type,
bool proc_context_corrupt, corrected, precise_pc, restartable_pc;
bool time_out, access_mode;
- /* If the type is unknown, bail. */
- if (type > CPER_ARM_MAX_TYPE)
- return;
-
/*
* Vendor type errors have error information values that are vendor
* specific.
*/
- if (type == CPER_ARM_VENDOR_ERROR)
+ if (type & CPER_ARM_VENDOR_ERROR)
return;
if (error_info & CPER_ARM_ERR_VALID_TRANSACTION_TYPE) {
@@ -116,43 +112,38 @@ static void cper_print_arm_err_info(const char *pfx, u32 type,
if (error_info & CPER_ARM_ERR_VALID_OPERATION_TYPE) {
op_type = ((error_info >> CPER_ARM_ERR_OPERATION_SHIFT)
& CPER_ARM_ERR_OPERATION_MASK);
- switch (type) {
- case CPER_ARM_CACHE_ERROR:
+ if (type & CPER_ARM_CACHE_ERROR) {
if (op_type < ARRAY_SIZE(arm_cache_err_op_strs)) {
- printk("%soperation type: %s\n", pfx,
+ printk("%scache error, operation type: %s\n", pfx,
arm_cache_err_op_strs[op_type]);
}
- break;
- case CPER_ARM_TLB_ERROR:
+ }
+ if (type & CPER_ARM_TLB_ERROR) {
if (op_type < ARRAY_SIZE(arm_tlb_err_op_strs)) {
- printk("%soperation type: %s\n", pfx,
+ printk("%sTLB error, operation type: %s\n", pfx,
arm_tlb_err_op_strs[op_type]);
}
- break;
- case CPER_ARM_BUS_ERROR:
+ }
+ if (type & CPER_ARM_BUS_ERROR) {
if (op_type < ARRAY_SIZE(arm_bus_err_op_strs)) {
- printk("%soperation type: %s\n", pfx,
+ printk("%sbus error, operation type: %s\n", pfx,
arm_bus_err_op_strs[op_type]);
}
- break;
}
}
if (error_info & CPER_ARM_ERR_VALID_LEVEL) {
level = ((error_info >> CPER_ARM_ERR_LEVEL_SHIFT)
& CPER_ARM_ERR_LEVEL_MASK);
- switch (type) {
- case CPER_ARM_CACHE_ERROR:
+ if (type & CPER_ARM_CACHE_ERROR)
printk("%scache level: %d\n", pfx, level);
- break;
- case CPER_ARM_TLB_ERROR:
+
+ if (type & CPER_ARM_TLB_ERROR)
printk("%sTLB level: %d\n", pfx, level);
- break;
- case CPER_ARM_BUS_ERROR:
+
+ if (type & CPER_ARM_BUS_ERROR)
printk("%saffinity level at which the bus error occurred: %d\n",
pfx, level);
- break;
- }
}
if (error_info & CPER_ARM_ERR_VALID_PROC_CONTEXT_CORRUPT) {
@@ -240,7 +231,8 @@ void cper_print_proc_arm(const char *pfx,
int i, len, max_ctx_type;
struct cper_arm_err_info *err_info;
struct cper_arm_ctx_info *ctx_info;
- char newpfx[64], infopfx[64];
+ char newpfx[64], infopfx[ARRAY_SIZE(newpfx) + 1];
+ char error_type[120];
printk("%sMIDR: 0x%016llx\n", pfx, proc->midr);
@@ -289,9 +281,15 @@ void cper_print_proc_arm(const char *pfx,
newpfx);
}
- printk("%serror_type: %d, %s\n", newpfx, err_info->type,
- err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
- cper_proc_error_type_strs[err_info->type] : "unknown");
+ cper_bits_to_str(error_type, sizeof(error_type),
+ FIELD_GET(CPER_ARM_ERR_TYPE_MASK, err_info->type),
+ cper_proc_error_type_strs,
+ ARRAY_SIZE(cper_proc_error_type_strs));
+
+ printk("%serror_type: 0x%02x: %s%s\n", newpfx, err_info->type,
+ error_type,
+ (err_info->type & ~CPER_ARM_ERR_TYPE_MASK) ? " with reserved bit(s)" : "");
+
if (err_info->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) {
printk("%serror_info: 0x%016llx\n", newpfx,
err_info->error_info);
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 928409199a1a..0232bd040f61 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -12,6 +12,7 @@
* Specification version 2.4.
*/
+#include <linux/bitmap.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/time.h>
@@ -69,7 +70,7 @@ const char *cper_severity_str(unsigned int severity)
}
EXPORT_SYMBOL_GPL(cper_severity_str);
-/*
+/**
* cper_print_bits - print strings for set bits
* @pfx: prefix for each line, including log level and prefix string
* @bits: bit mask
@@ -106,6 +107,65 @@ void cper_print_bits(const char *pfx, unsigned int bits,
printk("%s\n", buf);
}
+/**
+ * cper_bits_to_str - return a string for set bits
+ * @buf: buffer to store the output string
+ * @buf_size: size of the output string buffer
+ * @bits: bit mask
+ * @strs: string array, indexed by bit position
+ * @strs_size: size of the string array: @strs
+ *
+ * Add to @buf the bitmask in hexadecimal. Then, for each set bit in @bits,
+ * add the corresponding string describing the bit in @strs to @buf.
+ *
+ * A typical example is::
+ *
+ * const char * const bits[] = {
+ * "bit 3 name",
+ * "bit 4 name",
+ * "bit 5 name",
+ * };
+ * char str[120];
+ * unsigned int bitmask = BIT(3) | BIT(5);
+ * #define MASK GENMASK(5,3)
+ *
+ * cper_bits_to_str(str, sizeof(str), FIELD_GET(MASK, bitmask),
+ * bits, ARRAY_SIZE(bits));
+ *
+ * The above code fills the string ``str`` with ``bit 3 name|bit 5 name``.
+ *
+ * Return: number of bytes stored, or a negative error code.
+ */
+int cper_bits_to_str(char *buf, int buf_size, unsigned long bits,
+ const char * const strs[], unsigned int strs_size)
+{
+ int len = buf_size;
+ char *str = buf;
+ int i, size;
+
+ *buf = '\0';
+
+ for_each_set_bit(i, &bits, strs_size) {
+ if (*buf && len > 0) {
+ *str = '|';
+ len--;
+ str++;
+ }
+
+ size = strscpy(str, strs[i], len);
+ if (size < 0)
+ return size;
+
+ len -= size;
+ str += size;
+ }
+	return buf_size - len;
+}
+EXPORT_SYMBOL_GPL(cper_bits_to_str);
+
static const char * const proc_type_strs[] = {
"IA32/X64",
"IA64",
diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
index 874f63b4a383..9cb814c5ba1b 100644
--- a/drivers/firmware/efi/libstub/efi-stub.c
+++ b/drivers/firmware/efi/libstub/efi-stub.c
@@ -56,7 +56,7 @@ static struct screen_info *setup_graphics(void)
{
struct screen_info *si, tmp = {};
- if (efi_setup_gop(&tmp) != EFI_SUCCESS)
+ if (efi_setup_graphics(&tmp, NULL) != EFI_SUCCESS)
return NULL;
si = alloc_screen_info();
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index f5ba032863a9..b2fb0c3fa721 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -34,6 +34,9 @@
#define EFI_ALLOC_LIMIT ULONG_MAX
#endif
+struct edid_info;
+struct screen_info;
+
extern bool efi_no5lvl;
extern bool efi_nochunk;
extern bool efi_nokaslr;
@@ -578,6 +581,32 @@ union efi_graphics_output_protocol {
} mixed_mode;
};
+typedef union efi_edid_discovered_protocol efi_edid_discovered_protocol_t;
+
+union efi_edid_discovered_protocol {
+ struct {
+ u32 size_of_edid;
+ u8 *edid;
+ };
+ struct {
+ u32 size_of_edid;
+ u32 edid;
+ } mixed_mode;
+};
+
+typedef union efi_edid_active_protocol efi_edid_active_protocol_t;
+
+union efi_edid_active_protocol {
+ struct {
+ u32 size_of_edid;
+ u8 *edid;
+ };
+ struct {
+ u32 size_of_edid;
+ u32 edid;
+ } mixed_mode;
+};
+
typedef union {
struct {
u32 revision;
@@ -1085,7 +1114,7 @@ efi_status_t efi_parse_options(char const *cmdline);
void efi_parse_option_graphics(char *option);
-efi_status_t efi_setup_gop(struct screen_info *si);
+efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid);
efi_status_t handle_cmdline_files(efi_loaded_image_t *image,
const efi_char16_t *optstr,
diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c
index 3785fb4986b4..72d74436a7a4 100644
--- a/drivers/firmware/efi/libstub/gop.c
+++ b/drivers/firmware/efi/libstub/gop.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <asm/efi.h>
#include <asm/setup.h>
+#include <video/edid.h>
#include "efistub.h"
@@ -367,24 +368,31 @@ static void find_bits(u32 mask, u8 *pos, u8 *size)
*size = __fls(mask) - *pos + 1;
}
-static void
-setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
- efi_pixel_bitmask_t pixel_info, int pixel_format)
+static void setup_screen_info(struct screen_info *si, const efi_graphics_output_protocol_t *gop)
{
- if (pixel_format == PIXEL_BIT_MASK) {
- find_bits(pixel_info.red_mask,
- &si->red_pos, &si->red_size);
- find_bits(pixel_info.green_mask,
- &si->green_pos, &si->green_size);
- find_bits(pixel_info.blue_mask,
- &si->blue_pos, &si->blue_size);
- find_bits(pixel_info.reserved_mask,
- &si->rsvd_pos, &si->rsvd_size);
- si->lfb_depth = si->red_size + si->green_size +
- si->blue_size + si->rsvd_size;
- si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
+ const efi_graphics_output_protocol_mode_t *mode = efi_table_attr(gop, mode);
+ const efi_graphics_output_mode_info_t *info = efi_table_attr(mode, info);
+
+ si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+ si->lfb_width = info->horizontal_resolution;
+ si->lfb_height = info->vertical_resolution;
+
+ efi_set_u64_split(efi_table_attr(mode, frame_buffer_base),
+ &si->lfb_base, &si->ext_lfb_base);
+ if (si->ext_lfb_base)
+ si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+ si->pages = 1;
+
+ if (info->pixel_format == PIXEL_BIT_MASK) {
+ find_bits(info->pixel_information.red_mask, &si->red_pos, &si->red_size);
+ find_bits(info->pixel_information.green_mask, &si->green_pos, &si->green_size);
+ find_bits(info->pixel_information.blue_mask, &si->blue_pos, &si->blue_size);
+ find_bits(info->pixel_information.reserved_mask, &si->rsvd_pos, &si->rsvd_size);
+ si->lfb_depth = si->red_size + si->green_size + si->blue_size + si->rsvd_size;
+ si->lfb_linelength = (info->pixels_per_scan_line * si->lfb_depth) / 8;
} else {
- if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
+ if (info->pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
si->red_pos = 0;
si->blue_pos = 16;
} else /* PIXEL_BGR_RESERVED_8BIT_PER_COLOR */ {
@@ -394,20 +402,33 @@ setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
si->green_pos = 8;
si->rsvd_pos = 24;
- si->red_size = si->green_size =
- si->blue_size = si->rsvd_size = 8;
-
+ si->red_size = 8;
+ si->green_size = 8;
+ si->blue_size = 8;
+ si->rsvd_size = 8;
si->lfb_depth = 32;
- si->lfb_linelength = pixels_per_scan_line * 4;
+ si->lfb_linelength = info->pixels_per_scan_line * 4;
}
+
+ si->lfb_size = si->lfb_linelength * si->lfb_height;
+ si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
}
-static efi_graphics_output_protocol_t *find_gop(unsigned long num,
- const efi_handle_t handles[])
+static void setup_edid_info(struct edid_info *edid, u32 gop_size_of_edid, u8 *gop_edid)
+{
+ if (!gop_edid || gop_size_of_edid < 128)
+ memset(edid->dummy, 0, sizeof(edid->dummy));
+ else
+ memcpy(edid->dummy, gop_edid, min(gop_size_of_edid, sizeof(edid->dummy)));
+}
+
+static efi_handle_t find_handle_with_primary_gop(unsigned long num, const efi_handle_t handles[],
+ efi_graphics_output_protocol_t **found_gop)
{
efi_graphics_output_protocol_t *first_gop;
- efi_handle_t h;
+ efi_handle_t h, first_gop_handle;
+ first_gop_handle = NULL;
first_gop = NULL;
for_each_efi_handle(h, handles, num) {
@@ -442,21 +463,25 @@ static efi_graphics_output_protocol_t *find_gop(unsigned long num,
*/
status = efi_bs_call(handle_protocol, h,
&EFI_CONSOLE_OUT_DEVICE_GUID, &dummy);
- if (status == EFI_SUCCESS)
- return gop;
-
- if (!first_gop)
+ if (status == EFI_SUCCESS) {
+ if (found_gop)
+ *found_gop = gop;
+ return h;
+ } else if (!first_gop_handle) {
+ first_gop_handle = h;
first_gop = gop;
+ }
}
- return first_gop;
+ if (found_gop)
+ *found_gop = first_gop;
+ return first_gop_handle;
}
-efi_status_t efi_setup_gop(struct screen_info *si)
+efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid)
{
efi_handle_t *handles __free(efi_pool) = NULL;
- efi_graphics_output_protocol_mode_t *mode;
- efi_graphics_output_mode_info_t *info;
+ efi_handle_t handle;
efi_graphics_output_protocol_t *gop;
efi_status_t status;
unsigned long num;
@@ -467,35 +492,41 @@ efi_status_t efi_setup_gop(struct screen_info *si)
if (status != EFI_SUCCESS)
return status;
- gop = find_gop(num, handles);
- if (!gop)
+ handle = find_handle_with_primary_gop(num, handles, &gop);
+ if (!handle)
return EFI_NOT_FOUND;
/* Change mode if requested */
set_mode(gop);
/* EFI framebuffer */
- mode = efi_table_attr(gop, mode);
- info = efi_table_attr(mode, info);
-
- si->orig_video_isVGA = VIDEO_TYPE_EFI;
-
- si->lfb_width = info->horizontal_resolution;
- si->lfb_height = info->vertical_resolution;
-
- efi_set_u64_split(efi_table_attr(mode, frame_buffer_base),
- &si->lfb_base, &si->ext_lfb_base);
- if (si->ext_lfb_base)
- si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
-
- si->pages = 1;
-
- setup_pixel_info(si, info->pixels_per_scan_line,
- info->pixel_information, info->pixel_format);
-
- si->lfb_size = si->lfb_linelength * si->lfb_height;
+ if (si)
+ setup_screen_info(si, gop);
+
+ /* Display EDID for primary GOP */
+ if (edid) {
+ efi_edid_discovered_protocol_t *discovered_edid;
+ efi_edid_active_protocol_t *active_edid;
+ u32 gop_size_of_edid = 0;
+ u8 *gop_edid = NULL;
+
+ status = efi_bs_call(handle_protocol, handle, &EFI_EDID_ACTIVE_PROTOCOL_GUID,
+ (void **)&active_edid);
+ if (status == EFI_SUCCESS) {
+ gop_size_of_edid = active_edid->size_of_edid;
+ gop_edid = active_edid->edid;
+ } else {
+ status = efi_bs_call(handle_protocol, handle,
+ &EFI_EDID_DISCOVERED_PROTOCOL_GUID,
+ (void **)&discovered_edid);
+ if (status == EFI_SUCCESS) {
+ gop_size_of_edid = discovered_edid->size_of_edid;
+ gop_edid = discovered_edid->edid;
+ }
+ }
- si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
+ setup_edid_info(edid, gop_size_of_edid, gop_edid);
+ }
return EFI_SUCCESS;
}
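
The mask decomposition that setup_screen_info() relies on via find_bits() is worth seeing in isolation. A minimal standalone sketch, using GCC builtins in place of the kernel's __ffs()/__fls() (that substitution, and the zero-mask guard, are assumptions of this example):

    #include <stdint.h>
    #include <stdio.h>

    static void find_bits(uint32_t mask, uint8_t *pos, uint8_t *size)
    {
            if (!mask) {            /* guard added for the standalone case */
                    *pos = *size = 0;
                    return;
            }
            *pos = __builtin_ctz(mask);                      /* lowest set bit */
            *size = (31 - __builtin_clz(mask)) - *pos + 1;   /* field width */
    }

    int main(void)
    {
            uint8_t pos, size;

            find_bits(0x00ff0000, &pos, &size);  /* e.g. an XRGB8888 red mask */
            printf("pos=%u size=%u\n", pos, size);   /* prints pos=16 size=8 */
            return 0;
    }
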
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 761121a77f9e..cef32e2c82d8 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -203,6 +203,104 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
}
}
+struct smbios_entry_point {
+ u8 anchor[4];
+ u8 ep_checksum;
+ u8 ep_length;
+ u8 major_version;
+ u8 minor_version;
+ u16 max_size_entry;
+ u8 ep_rev;
+ u8 reserved[5];
+
+ struct __packed {
+ u8 anchor[5];
+ u8 checksum;
+ u16 st_length;
+ u32 st_address;
+ u16 number_of_entries;
+ u8 bcd_rev;
+ } intm;
+};
+
+static bool verify_ep_checksum(const void *ptr, int length)
+{
+ u8 sum = 0;
+
+ for (int i = 0; i < length; i++)
+ sum += ((u8 *)ptr)[i];
+
+ return sum == 0;
+}
+
+static bool verify_ep_integrity(const struct smbios_entry_point *ep)
+{
+ if (memcmp(ep->anchor, "_SM_", sizeof(ep->anchor)) != 0)
+ return false;
+
+ if (memcmp(ep->intm.anchor, "_DMI_", sizeof(ep->intm.anchor)) != 0)
+ return false;
+
+ if (!verify_ep_checksum(ep, ep->ep_length) ||
+ !verify_ep_checksum(&ep->intm, sizeof(ep->intm)))
+ return false;
+
+ return true;
+}
+
+static const struct efi_smbios_record *search_record(void *table, u32 length,
+ u8 type)
+{
+ const u8 *p, *end;
+
+ p = (u8 *)table;
+ end = p + length;
+
+ while (p + sizeof(struct efi_smbios_record) < end) {
+ const struct efi_smbios_record *hdr =
+ (struct efi_smbios_record *)p;
+ const u8 *next;
+
+ if (hdr->type == type)
+ return hdr;
+
+ /* Type 127 = End-of-Table */
+ if (hdr->type == 0x7F)
+ return NULL;
+
+		/* Jump to the unformatted section */
+ next = p + hdr->length;
+
+		/* The unformatted section ends with 0000h */
+		while (next + 1 < end && (next[0] != 0 || next[1] != 0))
+ next++;
+
+ next += 2;
+ p = next;
+ }
+
+ return NULL;
+}
+
+static const struct efi_smbios_record *get_table_record(u8 type)
+{
+ const struct smbios_entry_point *ep;
+
+ /*
+ * Locate the legacy 32-bit SMBIOS entrypoint in memory, and parse it
+ * directly. Needed by some Macs that do not implement the EFI protocol.
+ */
+ ep = get_efi_config_table(SMBIOS_TABLE_GUID);
+ if (!ep)
+ return NULL;
+
+ if (!verify_ep_integrity(ep))
+ return NULL;
+
+ return search_record((void *)(unsigned long)ep->intm.st_address,
+ ep->intm.st_length, type);
+}
+
static bool apple_match_product_name(void)
{
static const char type1_product_matches[][15] = {
@@ -218,7 +316,8 @@ static bool apple_match_product_name(void)
const struct efi_smbios_type1_record *record;
const u8 *product;
- record = (struct efi_smbios_type1_record *)efi_get_smbios_record(1);
+ record = (struct efi_smbios_type1_record *)
+ (efi_get_smbios_record(1) ?: get_table_record(1));
if (!record)
return false;
@@ -388,8 +487,9 @@ static void setup_quirks(struct boot_params *boot_params)
static void setup_graphics(struct boot_params *boot_params)
{
struct screen_info *si = memset(&boot_params->screen_info, 0, sizeof(*si));
+ struct edid_info *edid = memset(&boot_params->edid_info, 0, sizeof(*edid));
- efi_setup_gop(si);
+ efi_setup_graphics(si, edid);
}
static void __noreturn efi_exit(efi_handle_t handle, efi_status_t status)
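
The SMBIOS walk added above skips each record's formatted area via hdr->length, then scans past the trailing string-set, which is terminated by two NUL bytes. A minimal userspace model of that traversal (the record layout is simplified relative to struct efi_smbios_record, an assumption of this example):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct smbios_record {
            uint8_t type;
            uint8_t length;         /* length of the formatted area only */
            uint16_t handle;
    };

    static const struct smbios_record *find_record(const uint8_t *p,
                                                   size_t len, uint8_t type)
    {
            const uint8_t *end = p + len;

            while (p + sizeof(struct smbios_record) < end) {
                    const struct smbios_record *hdr = (const struct smbios_record *)p;
                    const uint8_t *next = p + hdr->length;

                    if (hdr->type == type)
                            return hdr;
                    if (hdr->type == 0x7f)          /* end-of-table */
                            return NULL;
                    /* skip the string-set: it ends with two NUL bytes */
                    while (next + 1 < end && (next[0] || next[1]))
                            next++;
                    p = next + 2;
            }
            return NULL;
    }

    int main(void)
    {
            /* type-1 record: 4-byte formatted area, one string "X", then 00 00 */
            static const uint8_t table[] = {
                    1, 4, 0, 0, 'X', 0, 0,          /* System Information */
                    0x7f, 4, 0, 0, 0, 0,            /* End-of-Table */
            };
            const struct smbios_record *r = find_record(table, sizeof(table), 1);

            printf("found type %u\n", r ? r->type : 0);      /* found type 1 */
            return 0;
    }
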
diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c
index c38b1a335590..e727cc5909cb 100644
--- a/drivers/firmware/efi/memattr.c
+++ b/drivers/firmware/efi/memattr.c
@@ -19,19 +19,19 @@ unsigned long __ro_after_init efi_mem_attr_table = EFI_INVALID_TABLE_ADDR;
* Reserve the memory associated with the Memory Attributes configuration
* table, if it exists.
*/
-int __init efi_memattr_init(void)
+void __init efi_memattr_init(void)
{
efi_memory_attributes_table_t *tbl;
unsigned long size;
if (efi_mem_attr_table == EFI_INVALID_TABLE_ADDR)
- return 0;
+ return;
tbl = early_memremap(efi_mem_attr_table, sizeof(*tbl));
if (!tbl) {
pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n",
efi_mem_attr_table);
- return -ENOMEM;
+ return;
}
if (tbl->version > 2) {
@@ -61,7 +61,6 @@ int __init efi_memattr_init(void)
unmap:
early_memunmap(tbl, sizeof(*tbl));
- return 0;
}
/*
diff --git a/drivers/firmware/efi/riscv-runtime.c b/drivers/firmware/efi/riscv-runtime.c
index fa71cd898120..4a2588358be2 100644
--- a/drivers/firmware/efi/riscv-runtime.c
+++ b/drivers/firmware/efi/riscv-runtime.c
@@ -36,20 +36,12 @@ static bool __init efi_virtmap_init(void)
init_new_context(NULL, &efi_mm);
for_each_efi_memory_desc(md) {
- phys_addr_t phys = md->phys_addr;
- int ret;
-
if (!(md->attribute & EFI_MEMORY_RUNTIME))
continue;
if (md->virt_addr == U64_MAX)
return false;
- ret = efi_create_mapping(&efi_mm, md);
- if (ret) {
- pr_warn(" EFI remap %pa: failed to create mapping (%d)\n",
- &phys, ret);
- return false;
- }
+ efi_create_mapping(&efi_mm, md);
}
if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions))
diff --git a/drivers/firmware/efi/stmm/mm_communication.h b/drivers/firmware/efi/stmm/mm_communication.h
index 52a1f32cd1eb..06e7663f96dc 100644
--- a/drivers/firmware/efi/stmm/mm_communication.h
+++ b/drivers/firmware/efi/stmm/mm_communication.h
@@ -32,7 +32,7 @@
/**
* struct efi_mm_communicate_header - Header used for SMM variable communication
-
+ *
* @header_guid: header use for disambiguation of content
* @message_len: length of the message. Does not include the size of the
* header
@@ -111,7 +111,7 @@ struct efi_mm_communicate_header {
/**
* struct smm_variable_communicate_header - Used for SMM variable communication
-
+ *
* @function: function to call in Smm.
* @ret_status: return status
* @data: payload
@@ -128,7 +128,7 @@ struct smm_variable_communicate_header {
/**
* struct smm_variable_access - Used to communicate with StMM by
* SetVariable and GetVariable.
-
+ *
* @guid: vendor GUID
* @data_size: size of EFI variable data
* @name_size: size of EFI name
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a1817b4b5173..58c3ffe707d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1678,9 +1678,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
struct pci_bus *root;
struct resource *res;
+ int max_size, r;
unsigned int i;
u16 cmd;
- int r;
if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
return 0;
@@ -1726,30 +1726,28 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
return 0;
/* Limit the BAR size to what is available */
- rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
- rbar_size);
+ max_size = pci_rebar_get_max_size(adev->pdev, 0);
+ if (max_size < 0)
+ return 0;
+ rbar_size = min(max_size, rbar_size);
/* Disable memory decoding while we change the BAR addresses and size */
pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
pci_write_config_word(adev->pdev, PCI_COMMAND,
cmd & ~PCI_COMMAND_MEMORY);
- /* Free the VRAM and doorbell BAR, we most likely need to move both. */
+ /* Tear down doorbell as resizing will release BARs */
amdgpu_doorbell_fini(adev);
- if (adev->asic_type >= CHIP_BONAIRE)
- pci_release_resource(adev->pdev, 2);
- pci_release_resource(adev->pdev, 0);
-
- r = pci_resize_resource(adev->pdev, 0, rbar_size);
+ r = pci_resize_resource(adev->pdev, 0, rbar_size,
+ (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
+ : 1 << 2);
if (r == -ENOSPC)
dev_info(adev->dev,
"Not enough PCI address space for a large BAR.");
else if (r && r != -ENOTSUPP)
dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);
- pci_assign_unassigned_bus_resources(adev->pdev->bus);
-
/* When the doorbell or fb BAR isn't available we have no chance of
* using the device.
*/
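
Across these call sites the shape of the new API is visible: the manual pci_release_resource() sequence is gone, and callers hand pci_resize_resource() a bitmask of additional BARs for the core to release while resizing (amdgpu passes BAR 2 or BAR 5 for the doorbell; i915 and xe below pass 0). A hedged sketch of a typical caller under that reading, with the device and BAR numbers as placeholder assumptions:

    #include <linux/pci.h>

    /* Hypothetical: resize BAR 0 to 'bytes', asking the core to also
     * release BAR 2 (e.g. a doorbell BAR sharing the same window).
     */
    static int foo_resize_vram_bar(struct pci_dev *pdev, resource_size_t bytes)
    {
            int size = pci_rebar_bytes_to_size(bytes);
            int max = pci_rebar_get_max_size(pdev, 0);

            if (max < 0)
                    return max;     /* resizable BARs not supported */

            return pci_resize_resource(pdev, 0, min(size, max), 1 << 2);
    }
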
diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
index 890183de2277..a30060fd4429 100644
--- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c
+++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
@@ -20,16 +20,6 @@
#include "gt/intel_gt_regs.h"
#ifdef CONFIG_64BIT
-static void _release_bars(struct pci_dev *pdev)
-{
- int resno;
-
- for (resno = PCI_STD_RESOURCES; resno < PCI_STD_RESOURCE_END; resno++) {
- if (pci_resource_len(pdev, resno))
- pci_release_resource(pdev, resno);
- }
-}
-
static void
_resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size)
{
@@ -37,9 +27,7 @@ _resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size)
int bar_size = pci_rebar_bytes_to_size(size);
int ret;
- _release_bars(pdev);
-
- ret = pci_resize_resource(pdev, resno, bar_size);
+ ret = pci_resize_resource(pdev, resno, bar_size, 0);
if (ret) {
drm_info(&i915->drm, "Failed to resize BAR%d to %dM (%pe)\n",
resno, 1 << bar_size, ERR_PTR(ret));
@@ -63,16 +51,12 @@ static void i915_resize_lmem_bar(struct drm_i915_private *i915, resource_size_t
current_size = roundup_pow_of_two(pci_resource_len(pdev, GEN12_LMEM_BAR));
if (i915->params.lmem_bar_size) {
- u32 bar_sizes;
-
- rebar_size = i915->params.lmem_bar_size *
- (resource_size_t)SZ_1M;
- bar_sizes = pci_rebar_get_possible_sizes(pdev, GEN12_LMEM_BAR);
-
+ rebar_size = i915->params.lmem_bar_size * (resource_size_t)SZ_1M;
if (rebar_size == current_size)
return;
- if (!(bar_sizes & BIT(pci_rebar_bytes_to_size(rebar_size))) ||
+ if (!pci_rebar_size_supported(pdev, GEN12_LMEM_BAR,
+ pci_rebar_bytes_to_size(rebar_size)) ||
rebar_size >= roundup_pow_of_two(lmem_size)) {
rebar_size = lmem_size;
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index bbeba0d3fca8..3abc9206f1a8 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1141,6 +1141,122 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
return func(vgpu, index, start, count, flags, data);
}
+static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
+ struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
+ int nr_areas = 1;
+ int cap_type_id;
+ unsigned int i;
+ int ret;
+
+ switch (info->index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = vgpu->gvt->device_info.cfg_space_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ break;
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = vgpu->cfg_space.bar[info->index].size;
+ if (!info->size) {
+ info->flags = 0;
+ break;
+ }
+
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ break;
+ case VFIO_PCI_BAR1_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = 0;
+ info->flags = 0;
+ break;
+ case VFIO_PCI_BAR2_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->flags = VFIO_REGION_INFO_FLAG_CAPS |
+ VFIO_REGION_INFO_FLAG_MMAP |
+ VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ info->size = gvt_aperture_sz(vgpu->gvt);
+
+ sparse = kzalloc(struct_size(sparse, areas, nr_areas),
+ GFP_KERNEL);
+ if (!sparse)
+ return -ENOMEM;
+
+ sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->header.version = 1;
+ sparse->nr_areas = nr_areas;
+ cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->areas[0].offset =
+ PAGE_ALIGN(vgpu_aperture_offset(vgpu));
+ sparse->areas[0].size = vgpu_aperture_sz(vgpu);
+ break;
+
+ case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = 0;
+ info->flags = 0;
+
+ gvt_dbg_core("get region info bar:%d\n", info->index);
+ break;
+
+ case VFIO_PCI_ROM_REGION_INDEX:
+ case VFIO_PCI_VGA_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = 0;
+ info->flags = 0;
+
+ gvt_dbg_core("get region info index:%d\n", info->index);
+ break;
+ default: {
+ struct vfio_region_info_cap_type cap_type = {
+ .header.id = VFIO_REGION_INFO_CAP_TYPE,
+ .header.version = 1
+ };
+
+ if (info->index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions)
+ return -EINVAL;
+ info->index = array_index_nospec(
+ info->index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions);
+
+ i = info->index - VFIO_PCI_NUM_REGIONS;
+
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = vgpu->region[i].size;
+ info->flags = vgpu->region[i].flags;
+
+ cap_type.type = vgpu->region[i].type;
+ cap_type.subtype = vgpu->region[i].subtype;
+
+ ret = vfio_info_add_capability(caps, &cap_type.header,
+ sizeof(cap_type));
+ if (ret)
+ return ret;
+ }
+ }
+
+ if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
+ ret = -EINVAL;
+ if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) {
+ ret = vfio_info_add_capability(
+ caps, &sparse->header,
+ struct_size(sparse, areas, sparse->nr_areas));
+ }
+ if (ret) {
+ kfree(sparse);
+ return ret;
+ }
+ }
+
+ kfree(sparse);
+ return 0;
+}
+
static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
unsigned long arg)
{
@@ -1169,152 +1285,6 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
- } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
- struct vfio_region_info info;
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
- unsigned int i;
- int ret;
- struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
- int nr_areas = 1;
- int cap_type_id;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
- case VFIO_PCI_CONFIG_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = vgpu->gvt->device_info.cfg_space_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- break;
- case VFIO_PCI_BAR0_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = vgpu->cfg_space.bar[info.index].size;
- if (!info.size) {
- info.flags = 0;
- break;
- }
-
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- break;
- case VFIO_PCI_BAR1_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = 0;
- info.flags = 0;
- break;
- case VFIO_PCI_BAR2_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.flags = VFIO_REGION_INFO_FLAG_CAPS |
- VFIO_REGION_INFO_FLAG_MMAP |
- VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- info.size = gvt_aperture_sz(vgpu->gvt);
-
- sparse = kzalloc(struct_size(sparse, areas, nr_areas),
- GFP_KERNEL);
- if (!sparse)
- return -ENOMEM;
-
- sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
- sparse->header.version = 1;
- sparse->nr_areas = nr_areas;
- cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
- sparse->areas[0].offset =
- PAGE_ALIGN(vgpu_aperture_offset(vgpu));
- sparse->areas[0].size = vgpu_aperture_sz(vgpu);
- break;
-
- case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = 0;
- info.flags = 0;
-
- gvt_dbg_core("get region info bar:%d\n", info.index);
- break;
-
- case VFIO_PCI_ROM_REGION_INDEX:
- case VFIO_PCI_VGA_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = 0;
- info.flags = 0;
-
- gvt_dbg_core("get region info index:%d\n", info.index);
- break;
- default:
- {
- struct vfio_region_info_cap_type cap_type = {
- .header.id = VFIO_REGION_INFO_CAP_TYPE,
- .header.version = 1 };
-
- if (info.index >= VFIO_PCI_NUM_REGIONS +
- vgpu->num_regions)
- return -EINVAL;
- info.index =
- array_index_nospec(info.index,
- VFIO_PCI_NUM_REGIONS +
- vgpu->num_regions);
-
- i = info.index - VFIO_PCI_NUM_REGIONS;
-
- info.offset =
- VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = vgpu->region[i].size;
- info.flags = vgpu->region[i].flags;
-
- cap_type.type = vgpu->region[i].type;
- cap_type.subtype = vgpu->region[i].subtype;
-
- ret = vfio_info_add_capability(&caps,
- &cap_type.header,
- sizeof(cap_type));
- if (ret)
- return ret;
- }
- }
-
- if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
- ret = -EINVAL;
- if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP)
- ret = vfio_info_add_capability(&caps,
- &sparse->header,
- struct_size(sparse, areas,
- sparse->nr_areas));
- if (ret) {
- kfree(sparse);
- return ret;
- }
- }
-
- if (caps.size) {
- info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info.argsz < sizeof(info) + caps.size) {
- info.argsz = sizeof(info) + caps.size;
- info.cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(info));
- if (copy_to_user((void __user *)arg +
- sizeof(info), caps.buf,
- caps.size)) {
- kfree(caps.buf);
- kfree(sparse);
- return -EFAULT;
- }
- info.cap_offset = sizeof(info);
- }
-
- kfree(caps.buf);
- }
-
- kfree(sparse);
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
struct vfio_irq_info info;
@@ -1477,6 +1447,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = {
.write = intel_vgpu_write,
.mmap = intel_vgpu_mmap,
.ioctl = intel_vgpu_ioctl,
+ .get_region_info_caps = intel_vgpu_ioctl_get_region_info,
.dma_unmap = intel_vgpu_dma_unmap,
.bind_iommufd = vfio_iommufd_emulated_bind,
.unbind_iommufd = vfio_iommufd_emulated_unbind,
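
The net effect of this refactor is that the VFIO core now owns the copy_from_user()/copy_to_user() and capability-buffer shifting that the removed ioctl branch open-coded; a driver's .get_region_info_caps callback only fills in the region info and optional caps. A hedged sketch of the minimal callback shape (names hypothetical, and it assumes the core performs the user copies as the removed code suggests):

    #include <linux/vfio.h>
    #include <linux/vfio_pci_core.h>

    /* Hypothetical callback reporting an unbacked region. */
    static int foo_get_region_info(struct vfio_device *vdev,
                                   struct vfio_region_info *info,
                                   struct vfio_info_cap *caps)
    {
            info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
            info->size = 0;         /* region not implemented on this device */
            info->flags = 0;
            return 0;               /* the core copies info back to userspace */
    }
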
diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c
index 0e10da790cc5..d50baefcd124 100644
--- a/drivers/gpu/drm/xe/xe_vram.c
+++ b/drivers/gpu/drm/xe/xe_vram.c
@@ -25,39 +25,13 @@
#include "xe_vram.h"
#include "xe_vram_types.h"
-#define BAR_SIZE_SHIFT 20
-
-/*
- * Release all the BARs that could influence/block LMEMBAR resizing, i.e.
- * assigned IORESOURCE_MEM_64 BARs
- */
-static void release_bars(struct pci_dev *pdev)
-{
- struct resource *res;
- int i;
-
- pci_dev_for_each_resource(pdev, res, i) {
- /* Resource already un-assigned, do not reset it */
- if (!res->parent)
- continue;
-
- /* No need to release unrelated BARs */
- if (!(res->flags & IORESOURCE_MEM_64))
- continue;
-
- pci_release_resource(pdev, i);
- }
-}
-
static void resize_bar(struct xe_device *xe, int resno, resource_size_t size)
{
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
int bar_size = pci_rebar_bytes_to_size(size);
int ret;
- release_bars(pdev);
-
- ret = pci_resize_resource(pdev, resno, bar_size);
+ ret = pci_resize_resource(pdev, resno, bar_size, 0);
if (ret) {
drm_info(&xe->drm, "Failed to resize BAR%d to %dM (%pe). Consider enabling 'Resizable BAR' support in your BIOS\n",
resno, 1 << bar_size, ERR_PTR(ret));
@@ -79,41 +53,37 @@ void xe_vram_resize_bar(struct xe_device *xe)
resource_size_t current_size;
resource_size_t rebar_size;
struct resource *root_res;
- u32 bar_size_mask;
+ int max_size, i;
u32 pci_cmd;
- int i;
/* gather some relevant info */
current_size = pci_resource_len(pdev, LMEM_BAR);
- bar_size_mask = pci_rebar_get_possible_sizes(pdev, LMEM_BAR);
-
- if (!bar_size_mask)
- return;
if (force_vram_bar_size < 0)
return;
/* set to a specific size? */
if (force_vram_bar_size) {
- u32 bar_size_bit;
+ rebar_size = pci_rebar_bytes_to_size(force_vram_bar_size *
+ (resource_size_t)SZ_1M);
- rebar_size = force_vram_bar_size * (resource_size_t)SZ_1M;
-
- bar_size_bit = bar_size_mask & BIT(pci_rebar_bytes_to_size(rebar_size));
-
- if (!bar_size_bit) {
+ if (!pci_rebar_size_supported(pdev, LMEM_BAR, rebar_size)) {
drm_info(&xe->drm,
- "Requested size: %lluMiB is not supported by rebar sizes: 0x%x. Leaving default: %lluMiB\n",
- (u64)rebar_size >> 20, bar_size_mask, (u64)current_size >> 20);
+ "Requested size: %lluMiB is not supported by rebar sizes: 0x%llx. Leaving default: %lluMiB\n",
+ (u64)pci_rebar_size_to_bytes(rebar_size) >> 20,
+ pci_rebar_get_possible_sizes(pdev, LMEM_BAR),
+ (u64)current_size >> 20);
return;
}
- rebar_size = 1ULL << (__fls(bar_size_bit) + BAR_SIZE_SHIFT);
-
+ rebar_size = pci_rebar_size_to_bytes(rebar_size);
if (rebar_size == current_size)
return;
} else {
- rebar_size = 1ULL << (__fls(bar_size_mask) + BAR_SIZE_SHIFT);
+ max_size = pci_rebar_get_max_size(pdev, LMEM_BAR);
+ if (max_size < 0)
+ return;
+ rebar_size = pci_rebar_size_to_bytes(max_size);
/* only resize if larger than current */
if (rebar_size <= current_size)
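
The helpers replace the BAR_SIZE_SHIFT arithmetic xe used to open-code: a ReBAR size is log2 of the BAR size in MiB. A standalone sketch of the two conversions (the bodies are illustrative stand-ins for pci_rebar_size_to_bytes()/pci_rebar_bytes_to_size(), assuming power-of-two inputs of at least 1 MiB):

    #include <stdint.h>
    #include <stdio.h>

    /* ReBAR sizes are encoded as log2(size in MiB), e.g. 8 -> 256 MiB. */
    static uint64_t rebar_size_to_bytes(int size)
    {
            return 1ULL << (size + 20);
    }

    static int rebar_bytes_to_size(uint64_t bytes)
    {
            return (63 - __builtin_clzll(bytes)) - 20;  /* assumes >= 1 MiB */
    }

    int main(void)
    {
            printf("%d -> %llu MiB\n", 8,
                   (unsigned long long)(rebar_size_to_bytes(8) >> 20)); /* 256 */
            printf("256 MiB -> %d\n", rebar_bytes_to_size(256ULL << 20)); /* 8 */
            return 0;
    }
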
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index f0323f1d6f01..794b9778816b 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -80,6 +80,7 @@ config INFINIBAND_VIRT_DMA
if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
if !UML
source "drivers/infiniband/hw/bnxt_re/Kconfig"
+source "drivers/infiniband/hw/bng_re/Kconfig"
source "drivers/infiniband/hw/cxgb4/Kconfig"
source "drivers/infiniband/hw/efa/Kconfig"
source "drivers/infiniband/hw/erdma/Kconfig"
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 01bede8ba105..024df6ee239d 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -34,7 +34,6 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
-#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
@@ -1057,6 +1056,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
{
struct cm_id_private *cm_id_priv;
enum ib_cm_state old_state;
+ unsigned long timeout;
struct cm_work *work;
int ret;
@@ -1167,10 +1167,9 @@ retest:
xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
cm_deref_id(cm_id_priv);
+ timeout = msecs_to_jiffies((cm_id_priv->max_cm_retries * cm_id_priv->timeout_ms * 5) / 4);
do {
- ret = wait_for_completion_timeout(&cm_id_priv->comp,
- msecs_to_jiffies(
- CM_DESTROY_ID_WAIT_TIMEOUT));
+ ret = wait_for_completion_timeout(&cm_id_priv->comp, timeout);
if (!ret) /* timeout happened */
cm_destroy_id_wait_timeout(cm_id, old_state);
} while (!ret);
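
With the constant gone, the wait scales to the id's own retry budget: with, for example, max_cm_retries = 15 and timeout_ms = 4096 (illustrative values), each wait_for_completion_timeout() round waits msecs_to_jiffies(15 * 4096 * 5 / 4) = 76800 ms, rather than the old fixed 10 s.
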
@@ -4518,7 +4517,7 @@ static int __init ib_cm_init(void)
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
INIT_LIST_HEAD(&cm.timewait_list);
- cm.wq = alloc_workqueue("ib_cm", 0, 1);
+ cm.wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1);
if (!cm.wq) {
ret = -ENOMEM;
goto error2;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 5b2d3ae3f9fc..95e89f5c147c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4475,6 +4475,8 @@ int rdma_connect_locked(struct rdma_cm_id *id,
container_of(id, struct rdma_id_private, id);
int ret;
+ lockdep_assert_held(&id_priv->handler_mutex);
+
if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
return -EINVAL;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b4f3c835844a..13e8a1714bbd 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -3021,7 +3021,7 @@ static int __init ib_core_init(void)
{
int ret = -ENOMEM;
- ib_wq = alloc_workqueue("infiniband", 0, 0);
+ ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0);
if (!ib_wq)
return -ENOMEM;
@@ -3031,7 +3031,7 @@ static int __init ib_core_init(void)
goto err;
ib_comp_wq = alloc_workqueue("ib-comp-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0);
if (!ib_comp_wq)
goto err_unbound;
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index a7de6f403fca..b097cfcade1c 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -175,7 +175,7 @@ void rdma_restrack_new(struct rdma_restrack_entry *res,
EXPORT_SYMBOL(rdma_restrack_new);
/**
- * rdma_restrack_add() - add object to the reource tracking database
+ * rdma_restrack_add() - add object to the resource tracking database
* @res: resource entry
*/
void rdma_restrack_add(struct rdma_restrack_entry *res)
@@ -277,7 +277,7 @@ int rdma_restrack_put(struct rdma_restrack_entry *res)
EXPORT_SYMBOL(rdma_restrack_put);
/**
- * rdma_restrack_del() - delete object from the reource tracking database
+ * rdma_restrack_del() - delete object from the resource tracking database
* @res: resource entry
*/
void rdma_restrack_del(struct rdma_restrack_entry *res)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index f86ece701db6..ec3be65a2b88 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -366,7 +366,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
xa_lock(&ctx_table);
if (xa_load(&ctx_table, ctx->id) == ctx)
- queue_work(system_unbound_wq, &ctx->close_work);
+ queue_work(system_dfl_wq, &ctx->close_work);
xa_unlock(&ctx_table);
}
return 0;
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c5b686394760..8137031c2a65 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -45,6 +45,8 @@
#include "uverbs.h"
+#define RESCHED_LOOP_CNT_THRESHOLD 0x1000
+
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
bool make_dirty = umem->writable && dirty;
@@ -55,10 +57,14 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
DMA_BIDIRECTIONAL, 0);
- for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i)
+ for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) {
unpin_user_page_range_dirty_lock(sg_page(sg),
DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);
+ if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD))
+ cond_resched();
+ }
+
sg_free_append_table(&umem->sgt_append);
}
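
The unpin loop now yields the CPU once every 0x1000 entries rather than never; the throttling arithmetic is the interesting part and generalizes to any long kernel loop. A minimal userspace model, with sched_yield() standing in for cond_resched():

    #include <sched.h>
    #include <stdio.h>

    #define RESCHED_LOOP_CNT_THRESHOLD 0x1000

    int main(void)
    {
            long yields = 0;

            for (long i = 0; i < 10000; i++) {
                    /* expensive per-entry work would go here */
                    if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD)) {
                            sched_yield();  /* models cond_resched() */
                            yields++;
                    }
            }
            printf("yielded %ld times\n", yields);  /* 2: at i=4096, 8192 */
            return 0;
    }
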
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3a5f81402d2f..11b1a194de44 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -148,6 +148,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
case IB_RATE_400_GBPS: return 160;
case IB_RATE_600_GBPS: return 240;
case IB_RATE_800_GBPS: return 320;
+ case IB_RATE_1600_GBPS: return 640;
default: return -1;
}
}
@@ -178,6 +179,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
case 160: return IB_RATE_400_GBPS;
case 240: return IB_RATE_600_GBPS;
case 320: return IB_RATE_800_GBPS;
+ case 640: return IB_RATE_1600_GBPS;
default: return IB_RATE_PORT_CURRENT;
}
}
@@ -208,6 +210,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
case IB_RATE_400_GBPS: return 425000;
case IB_RATE_600_GBPS: return 637500;
case IB_RATE_800_GBPS: return 850000;
+ case IB_RATE_1600_GBPS: return 1700000;
default: return -1;
}
}
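
The new entries follow the existing arithmetic in all three tables: the multiplier is expressed in 2.5 Gb/s units (640 x 2.5 = 1600 Gb/s), and the Mb/s value keeps the x1.0625 line-rate scaling already used for the 400 and 800 Gb/s rows (425000 and 850000), giving 1700000.
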
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index b706dc0d0263..c42b22ac3303 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -13,5 +13,6 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/
obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/
obj-$(CONFIG_INFINIBAND_QEDR) += qedr/
obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/
+obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re/
obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/
obj-$(CONFIG_INFINIBAND_IONIC) += ionic/
diff --git a/drivers/infiniband/hw/bng_re/Kconfig b/drivers/infiniband/hw/bng_re/Kconfig
new file mode 100644
index 000000000000..85845f72c64d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config INFINIBAND_BNG_RE
+ tristate "Broadcom Next generation RoCE HCA support"
+ depends on 64BIT
+ depends on INET && DCB && BNGE
+ help
+ This driver supports Broadcom Next generation
+ 50/100/200/400/800 gigabit RoCE HCAs. The module
+ will be called bng_re. To compile this driver
+ as a module, choose M here.
diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile
new file mode 100644
index 000000000000..c6aaaf853c77
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge -I $(srctree)/drivers/infiniband/hw/bnxt_re
+
+obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o
+
+bng_re-y := bng_dev.o bng_fw.o \
+ bng_res.o bng_sp.o \
+ bng_debugfs.o
diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.c b/drivers/infiniband/hw/bng_re/bng_debugfs.c
new file mode 100644
index 000000000000..9ec5a8785250
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_debugfs.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/debugfs.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bnge.h"
+#include "bnge_auxr.h"
+#include "bng_re.h"
+#include "bng_debugfs.h"
+
+static struct dentry *bng_re_debugfs_root;
+
+void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev)
+{
+ struct pci_dev *pdev = rdev->aux_dev->pdev;
+
+ rdev->dbg_root =
+ debugfs_create_dir(dev_name(&pdev->dev), bng_re_debugfs_root);
+}
+
+void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev)
+{
+ debugfs_remove_recursive(rdev->dbg_root);
+ rdev->dbg_root = NULL;
+}
+
+void bng_re_register_debugfs(void)
+{
+ bng_re_debugfs_root = debugfs_create_dir("bng_re", NULL);
+}
+
+void bng_re_unregister_debugfs(void)
+{
+ debugfs_remove(bng_re_debugfs_root);
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.h b/drivers/infiniband/hw/bng_re/bng_debugfs.h
new file mode 100644
index 000000000000..baef71df4242
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_debugfs.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RE_DEBUGFS__
+#define __BNG_RE_DEBUGFS__
+
+void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev);
+void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev);
+
+void bng_re_register_debugfs(void);
+void bng_re_unregister_debugfs(void);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c
new file mode 100644
index 000000000000..d8f8d7f7075f
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_dev.c
@@ -0,0 +1,534 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/auxiliary_bus.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "bng_res.h"
+#include "bng_sp.h"
+#include "bng_fw.h"
+#include "bnge.h"
+#include "bnge_auxr.h"
+#include "bng_re.h"
+#include "bnge_hwrm.h"
+#include "bng_debugfs.h"
+
+MODULE_AUTHOR("Siva Reddy Kallam <siva.kallam@broadcom.com>");
+MODULE_DESCRIPTION(BNG_RE_DESC);
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct bng_re_dev *bng_re_dev_add(struct auxiliary_device *adev,
+ struct bnge_auxr_dev *aux_dev)
+{
+ struct bng_re_dev *rdev;
+
+ /* Allocate bng_re_dev instance */
+ rdev = ib_alloc_device(bng_re_dev, ibdev);
+ if (!rdev) {
+		pr_err("%s: bng_re_dev allocation failure!\n", KBUILD_MODNAME);
+ return NULL;
+ }
+
+ /* Assign auxiliary device specific data */
+ rdev->netdev = aux_dev->net;
+ rdev->aux_dev = aux_dev;
+ rdev->adev = adev;
+ rdev->fn_id = rdev->aux_dev->pdev->devfn;
+
+ return rdev;
+}
+
+static int bng_re_register_netdev(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev;
+
+ aux_dev = rdev->aux_dev;
+ return bnge_register_dev(aux_dev, rdev->adev);
+}
+
+static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev)
+{
+ struct bng_re_chip_ctx *chip_ctx;
+
+ if (!rdev->chip_ctx)
+ return;
+
+ kfree(rdev->dev_attr);
+ rdev->dev_attr = NULL;
+
+ chip_ctx = rdev->chip_ctx;
+ rdev->chip_ctx = NULL;
+ rdev->rcfw.res = NULL;
+ rdev->bng_res.cctx = NULL;
+ rdev->bng_res.pdev = NULL;
+ kfree(chip_ctx);
+}
+
+static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev)
+{
+ struct bng_re_chip_ctx *chip_ctx;
+ struct bnge_auxr_dev *aux_dev;
+ int rc = -ENOMEM;
+
+ aux_dev = rdev->aux_dev;
+ rdev->bng_res.pdev = aux_dev->pdev;
+ rdev->rcfw.res = &rdev->bng_res;
+ chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL);
+ if (!chip_ctx)
+ return -ENOMEM;
+ chip_ctx->chip_num = aux_dev->chip_num;
+ chip_ctx->hw_stats_size = aux_dev->hw_ring_stats_size;
+
+ rdev->chip_ctx = chip_ctx;
+ rdev->bng_res.cctx = rdev->chip_ctx;
+ rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL);
+ if (!rdev->dev_attr)
+ goto free_chip_ctx;
+ rdev->bng_res.dattr = rdev->dev_attr;
+
+ return 0;
+free_chip_ctx:
+ kfree(rdev->chip_ctx);
+ rdev->chip_ctx = NULL;
+ return rc;
+}
+
+static void bng_re_init_hwrm_hdr(struct input *hdr, u16 opcd)
+{
+ hdr->req_type = cpu_to_le16(opcd);
+ hdr->cmpl_ring = cpu_to_le16(-1);
+ hdr->target_id = cpu_to_le16(-1);
+}
+
+static void bng_re_fill_fw_msg(struct bnge_fw_msg *fw_msg, void *msg,
+ int msg_len, void *resp, int resp_max_len,
+ int timeout)
+{
+ fw_msg->msg = msg;
+ fw_msg->msg_len = msg_len;
+ fw_msg->resp = resp;
+ fw_msg->resp_max_len = resp_max_len;
+ fw_msg->timeout = timeout;
+}
+
+static int bng_re_net_ring_free(struct bng_re_dev *rdev,
+ u16 fw_ring_id, int type)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ring_free_input req = {};
+ struct hwrm_ring_free_output resp;
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+	if (!aux_dev)
+		return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE);
+ req.ring_type = type;
+ req.ring_id = cpu_to_le16(fw_ring_id);
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc)
+		ibdev_err(&rdev->ibdev, "Failed to free HW ring %d: %#x",
+ req.ring_id, rc);
+ return rc;
+}
+
+static int bng_re_net_ring_alloc(struct bng_re_dev *rdev,
+ struct bng_re_ring_attr *ring_attr,
+ u16 *fw_ring_id)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ring_alloc_input req = {};
+ struct hwrm_ring_alloc_output resp;
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC);
+ req.enables = 0;
+ req.page_tbl_addr = cpu_to_le64(ring_attr->dma_arr[0]);
+ if (ring_attr->pages > 1) {
+ /* Page size is in log2 units */
+ req.page_size = BNGE_PAGE_SHIFT;
+ req.page_tbl_depth = 1;
+ }
+ req.fbo = 0;
+ /* Association of ring index with doorbell index and MSIX number */
+ req.logical_id = cpu_to_le16(ring_attr->lrid);
+ req.length = cpu_to_le32(ring_attr->depth + 1);
+ req.ring_type = ring_attr->type;
+ req.int_mode = ring_attr->mode;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (!rc)
+ *fw_ring_id = le16_to_cpu(resp.ring_id);
+
+ return rc;
+}
+
+static int bng_re_stats_ctx_free(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_stat_ctx_free_input req = {};
+ struct hwrm_stat_ctx_free_output resp = {};
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE);
+ req.stat_ctx_id = cpu_to_le32(rdev->stats_ctx.fw_id);
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc)
+ ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x",
+ rc);
+
+ return rc;
+}
+
+static int bng_re_stats_ctx_alloc(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct bng_re_stats *stats = &rdev->stats_ctx;
+ struct hwrm_stat_ctx_alloc_output resp = {};
+ struct hwrm_stat_ctx_alloc_input req = {};
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ stats->fw_id = BNGE_INVALID_STATS_CTX_ID;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC);
+ req.update_period_ms = cpu_to_le32(1000);
+ req.stats_dma_addr = cpu_to_le64(stats->dma_map);
+ req.stats_dma_length = cpu_to_le16(rdev->chip_ctx->hw_stats_size);
+ req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (!rc)
+ stats->fw_id = le32_to_cpu(resp.stat_ctx_id);
+ return rc;
+}
+
+static void bng_re_query_hwrm_version(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ver_get_output ver_get_resp = {};
+ struct hwrm_ver_get_input ver_get_req = {};
+ struct bng_re_chip_ctx *cctx;
+ struct bnge_fw_msg fw_msg = {};
+ int rc;
+
+ bng_re_init_hwrm_hdr((void *)&ver_get_req, HWRM_VER_GET);
+ ver_get_req.hwrm_intf_maj = HWRM_VERSION_MAJOR;
+ ver_get_req.hwrm_intf_min = HWRM_VERSION_MINOR;
+ ver_get_req.hwrm_intf_upd = HWRM_VERSION_UPDATE;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&ver_get_req, sizeof(ver_get_req),
+ (void *)&ver_get_resp, sizeof(ver_get_resp),
+ BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x",
+ rc);
+ return;
+ }
+
+ cctx = rdev->chip_ctx;
+ cctx->hwrm_intf_ver =
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_major) << 48 |
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_minor) << 32 |
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_build) << 16 |
+ le16_to_cpu(ver_get_resp.hwrm_intf_patch);
+
+ cctx->hwrm_cmd_max_timeout = le16_to_cpu(ver_get_resp.max_req_timeout);
+
+ if (!cctx->hwrm_cmd_max_timeout)
+ cctx->hwrm_cmd_max_timeout = BNG_ROCE_FW_MAX_TIMEOUT;
+}
+
+static void bng_re_dev_uninit(struct bng_re_dev *rdev)
+{
+	int rc;
+
+	bng_re_debugfs_rem_pdev(rdev);
+
+ if (test_and_clear_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) {
+ rc = bng_re_deinit_rcfw(&rdev->rcfw);
+ if (rc)
+ ibdev_warn(&rdev->ibdev,
+ "Failed to deinitialize RCFW: %#x", rc);
+ bng_re_stats_ctx_free(rdev);
+ bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx);
+ bng_re_disable_rcfw_channel(&rdev->rcfw);
+ bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id,
+ RING_ALLOC_REQ_RING_TYPE_NQ);
+ bng_re_free_rcfw_channel(&rdev->rcfw);
+ }
+
+ kfree(rdev->nqr);
+ rdev->nqr = NULL;
+ bng_re_destroy_chip_ctx(rdev);
+ if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags))
+ bnge_unregister_dev(rdev->aux_dev);
+}
+
+static int bng_re_dev_init(struct bng_re_dev *rdev)
+{
+ struct bng_re_ring_attr rattr = {};
+ struct bng_re_creq_ctx *creq;
+ u32 db_offt;
+ int vid;
+ u8 type;
+ int rc;
+
+	/* Register a new RoCE device instance with the netdev */
+ rc = bng_re_register_netdev(rdev);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+			  "Failed to register with netdev: %#x\n", rc);
+ return -EINVAL;
+ }
+
+ set_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+
+ if (rdev->aux_dev->auxr_info->msix_requested < BNG_RE_MIN_MSIX) {
+ ibdev_err(&rdev->ibdev,
+			  "RoCE requires a minimum of 2 MSI-X vectors, but only %d are reserved\n",
+ rdev->aux_dev->auxr_info->msix_requested);
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ return -EINVAL;
+ }
+ ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n",
+ rdev->aux_dev->auxr_info->msix_requested);
+
+ rc = bng_re_setup_chip_ctx(rdev);
+ if (rc) {
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ ibdev_err(&rdev->ibdev, "Failed to get chip context\n");
+ return -EINVAL;
+ }
+
+ bng_re_query_hwrm_version(rdev);
+
+ rc = bng_re_alloc_fw_channel(&rdev->bng_res, &rdev->rcfw);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to allocate RCFW Channel: %#x\n", rc);
+ goto fail;
+ }
+
+ /* Allocate nq record memory */
+ rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL);
+ if (!rdev->nqr) {
+ bng_re_destroy_chip_ctx(rdev);
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ return -ENOMEM;
+ }
+
+ rdev->nqr->num_msix = rdev->aux_dev->auxr_info->msix_requested;
+ memcpy(rdev->nqr->msix_entries, rdev->aux_dev->msix_info,
+ sizeof(struct bnge_msix_info) * rdev->nqr->num_msix);
+
+ type = RING_ALLOC_REQ_RING_TYPE_NQ;
+ creq = &rdev->rcfw.creq;
+ rattr.dma_arr = creq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr;
+ rattr.pages = creq->hwq.pbl[creq->hwq.level].pg_count;
+ rattr.type = type;
+ rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+ rattr.depth = BNG_FW_CREQE_MAX_CNT - 1;
+ rattr.lrid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].ring_idx;
+ rc = bng_re_net_ring_alloc(rdev, &rattr, &creq->ring_id);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc);
+ goto free_rcfw;
+ }
+ db_offt = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].db_offset;
+ vid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].vector;
+
+ rc = bng_re_enable_fw_channel(&rdev->rcfw,
+ vid, db_offt);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n",
+ rc);
+ goto free_ring;
+ }
+
+ rc = bng_re_get_dev_attr(&rdev->rcfw);
+ if (rc)
+ goto disable_rcfw;
+
+ bng_re_debugfs_add_pdev(rdev);
+ rc = bng_re_alloc_stats_ctx_mem(rdev->bng_res.pdev, rdev->chip_ctx,
+ &rdev->stats_ctx);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to allocate stats context: %#x\n", rc);
+ goto disable_rcfw;
+ }
+
+ rc = bng_re_stats_ctx_alloc(rdev);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+			  "Failed to allocate HW stats context: %#x\n", rc);
+ goto free_stats_ctx;
+ }
+
+ rc = bng_re_init_rcfw(&rdev->rcfw, &rdev->stats_ctx);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to initialize RCFW: %#x\n", rc);
+ goto free_sctx;
+ }
+ set_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags);
+
+ return 0;
+free_sctx:
+ bng_re_stats_ctx_free(rdev);
+free_stats_ctx:
+ bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx);
+disable_rcfw:
+ bng_re_disable_rcfw_channel(&rdev->rcfw);
+free_ring:
+ bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type);
+free_rcfw:
+ bng_re_free_rcfw_channel(&rdev->rcfw);
+fail:
+ bng_re_dev_uninit(rdev);
+ return rc;
+}
+
+static int bng_re_add_device(struct auxiliary_device *adev)
+{
+ struct bnge_auxr_priv *auxr_priv =
+ container_of(adev, struct bnge_auxr_priv, aux_dev);
+ struct bng_re_en_dev_info *dev_info;
+ struct bng_re_dev *rdev;
+ int rc;
+
+ dev_info = auxiliary_get_drvdata(adev);
+
+ rdev = bng_re_dev_add(adev, auxr_priv->auxr_dev);
+ if (!rdev) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ dev_info->rdev = rdev;
+
+ rc = bng_re_dev_init(rdev);
+ if (rc)
+ goto re_dev_dealloc;
+
+ return 0;
+
+re_dev_dealloc:
+ ib_dealloc_device(&rdev->ibdev);
+exit:
+ return rc;
+}
+
+static void bng_re_remove_device(struct bng_re_dev *rdev,
+ struct auxiliary_device *aux_dev)
+{
+ bng_re_dev_uninit(rdev);
+ ib_dealloc_device(&rdev->ibdev);
+}
+
+static int bng_re_probe(struct auxiliary_device *adev,
+ const struct auxiliary_device_id *id)
+{
+ struct bnge_auxr_priv *aux_priv =
+ container_of(adev, struct bnge_auxr_priv, aux_dev);
+ struct bng_re_en_dev_info *en_info;
+ int rc;
+
+ en_info = kzalloc(sizeof(*en_info), GFP_KERNEL);
+ if (!en_info)
+ return -ENOMEM;
+
+ en_info->auxr_dev = aux_priv->auxr_dev;
+
+ auxiliary_set_drvdata(adev, en_info);
+
+ rc = bng_re_add_device(adev);
+ if (rc)
+ kfree(en_info);
+
+ return rc;
+}
+
+static void bng_re_remove(struct auxiliary_device *adev)
+{
+ struct bng_re_en_dev_info *dev_info = auxiliary_get_drvdata(adev);
+ struct bng_re_dev *rdev;
+
+ rdev = dev_info->rdev;
+
+ if (rdev)
+ bng_re_remove_device(rdev, adev);
+ kfree(dev_info);
+}
+
+static const struct auxiliary_device_id bng_re_id_table[] = {
+ { .name = BNG_RE_ADEV_NAME ".rdma", },
+ {},
+};
+
+MODULE_DEVICE_TABLE(auxiliary, bng_re_id_table);
+
+static struct auxiliary_driver bng_re_driver = {
+ .name = "rdma",
+ .probe = bng_re_probe,
+ .remove = bng_re_remove,
+ .id_table = bng_re_id_table,
+};
+
+static int __init bng_re_mod_init(void)
+{
+ int rc;
+
+ bng_re_register_debugfs();
+
+ rc = auxiliary_driver_register(&bng_re_driver);
+ if (rc) {
+ pr_err("%s: Failed to register auxiliary driver\n",
+ KBUILD_MODNAME);
+ goto unreg_debugfs;
+ }
+ return 0;
+unreg_debugfs:
+ bng_re_unregister_debugfs();
+ return rc;
+}
+
+static void __exit bng_re_mod_exit(void)
+{
+ auxiliary_driver_unregister(&bng_re_driver);
+ bng_re_unregister_debugfs();
+}
+
+module_init(bng_re_mod_init);
+module_exit(bng_re_mod_exit);
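
One detail worth isolating from bng_re_query_hwrm_version() above is the interface-version packing: four 16-bit fields are folded into a single u64 so that version comparisons become one integer compare. A standalone sketch (the sample field values are made up):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t pack_ver(uint16_t maj, uint16_t min, uint16_t bld, uint16_t pat)
    {
            return (uint64_t)maj << 48 | (uint64_t)min << 32 |
                   (uint64_t)bld << 16 | pat;
    }

    int main(void)
    {
            uint64_t v = pack_ver(1, 10, 3, 0);     /* illustrative 1.10.3.0 */

            printf("0x%016llx\n", (unsigned long long)v);
            /* monotonic compare works across all four fields: */
            printf("%d\n", v >= pack_ver(1, 9, 0, 0));      /* prints 1 */
            return 0;
    }
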
diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c
new file mode 100644
index 000000000000..7d9539113cf5
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_fw.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/pci.h>
+
+#include "roce_hsi.h"
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bng_sp.h"
+
+/**
+ * bng_re_map_rc - map return type based on opcode
+ * @opcode: roce slow path opcode
+ *
+ * case #1
+ * Firmware-initiated error recovery is a safe state machine, and the
+ * driver can consider all the underlying rdma resources free.
+ * In this state, it is safe to return success for opcodes related to
+ * destroying rdma resources (like destroy qp, destroy cq etc.).
+ *
+ * case #2
+ * If the driver detects a potential firmware stall, the state machine is
+ * not safe, and the driver cannot assume that all the underlying rdma
+ * resources have been freed.
+ * In this state, it is not safe to return success for opcodes related to
+ * destroying rdma resources (like destroy qp, destroy cq etc.).
+ *
+ * Scope of this helper function is only for case #1.
+ *
+ * Returns:
+ * 0 to communicate success to caller.
+ * Non zero error code to communicate failure to caller.
+ */
+static int bng_re_map_rc(u8 opcode)
+{
+ switch (opcode) {
+ case CMDQ_BASE_OPCODE_DESTROY_QP:
+ case CMDQ_BASE_OPCODE_DESTROY_SRQ:
+ case CMDQ_BASE_OPCODE_DESTROY_CQ:
+ case CMDQ_BASE_OPCODE_DEALLOCATE_KEY:
+ case CMDQ_BASE_OPCODE_DEREGISTER_MR:
+ case CMDQ_BASE_OPCODE_DELETE_GID:
+ case CMDQ_BASE_OPCODE_DESTROY_QP1:
+ case CMDQ_BASE_OPCODE_DESTROY_AH:
+ case CMDQ_BASE_OPCODE_DEINITIALIZE_FW:
+ case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC:
+ case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE:
+ return 0;
+ default:
+ return -ETIMEDOUT;
+ }
+}
+
+void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw)
+{
+ kfree(rcfw->crsqe_tbl);
+ bng_re_free_hwq(rcfw->res, &rcfw->cmdq.hwq);
+ bng_re_free_hwq(rcfw->res, &rcfw->creq.hwq);
+ rcfw->pdev = NULL;
+}
+
+int bng_re_alloc_fw_channel(struct bng_re_res *res,
+ struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_hwq_attr hwq_attr = {};
+ struct bng_re_sg_info sginfo = {};
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_creq_ctx *creq;
+
+ rcfw->pdev = res->pdev;
+ cmdq = &rcfw->cmdq;
+ creq = &rcfw->creq;
+ rcfw->res = res;
+
+ sginfo.pgsize = PAGE_SIZE;
+ sginfo.pgshft = PAGE_SHIFT;
+
+ hwq_attr.sginfo = &sginfo;
+ hwq_attr.res = rcfw->res;
+ hwq_attr.depth = BNG_FW_CREQE_MAX_CNT;
+ hwq_attr.stride = BNG_FW_CREQE_UNITS;
+ hwq_attr.type = BNG_HWQ_TYPE_QUEUE;
+
+ if (bng_re_alloc_init_hwq(&creq->hwq, &hwq_attr)) {
+ dev_err(&rcfw->pdev->dev,
+ "HW channel CREQ allocation failed\n");
+ goto fail;
+ }
+
+ rcfw->cmdq_depth = BNG_FW_CMDQE_MAX_CNT;
+
+ sginfo.pgsize = bng_fw_cmdqe_page_size(rcfw->cmdq_depth);
+ hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF;
+ hwq_attr.stride = BNG_FW_CMDQE_UNITS;
+ hwq_attr.type = BNG_HWQ_TYPE_CTX;
+ if (bng_re_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) {
+ dev_err(&rcfw->pdev->dev,
+ "HW channel CMDQ allocation failed\n");
+ goto fail;
+ }
+
+ rcfw->crsqe_tbl = kcalloc(cmdq->hwq.max_elements,
+ sizeof(*rcfw->crsqe_tbl), GFP_KERNEL);
+ if (!rcfw->crsqe_tbl)
+ goto fail;
+
+ spin_lock_init(&rcfw->tbl_lock);
+
+ rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout;
+ return 0;
+
+fail:
+ bng_re_free_rcfw_channel(rcfw);
+ return -ENOMEM;
+}
+
+static int bng_re_process_qp_event(struct bng_re_rcfw *rcfw,
+ struct creq_qp_event *qp_event,
+ u32 *num_wait)
+{
+ struct bng_re_hwq *hwq = &rcfw->cmdq.hwq;
+ struct bng_re_crsqe *crsqe;
+ u32 req_size;
+ u16 cookie;
+ bool is_waiter_alive;
+ struct pci_dev *pdev;
+ u32 wait_cmds = 0;
+ int rc = 0;
+
+ pdev = rcfw->pdev;
+ switch (qp_event->event) {
+ case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION:
+ dev_err(&pdev->dev, "Received QP error notification\n");
+ break;
+ default:
+ /*
+ * Command Response
+		 * cmdq->lock needs to be acquired to synchronize
+ * the command send and completion reaping. This function
+ * is always called with creq->lock held. Using
+ * the nested variant of spin_lock.
+ *
+ */
+
+ spin_lock_nested(&hwq->lock, SINGLE_DEPTH_NESTING);
+ cookie = le16_to_cpu(qp_event->cookie);
+ cookie &= BNG_FW_MAX_COOKIE_VALUE;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED,
+ &rcfw->cmdq.flags),
+			      "Unresponsive rcfw channel detected!")) {
+ dev_info(&pdev->dev,
+ "rcfw timedout: cookie = %#x, free_slots = %d",
+ cookie, crsqe->free_slots);
+ spin_unlock(&hwq->lock);
+ return rc;
+ }
+
+ if (crsqe->is_waiter_alive) {
+ if (crsqe->resp) {
+ memcpy(crsqe->resp, qp_event, sizeof(*qp_event));
+ /* Insert write memory barrier to ensure that
+ * response data is copied before clearing the
+ * flags
+ */
+ smp_wmb();
+ }
+ }
+
+ wait_cmds++;
+
+ req_size = crsqe->req_size;
+ is_waiter_alive = crsqe->is_waiter_alive;
+
+ crsqe->req_size = 0;
+ if (!is_waiter_alive)
+ crsqe->resp = NULL;
+
+ crsqe->is_in_used = false;
+
+ hwq->cons += req_size;
+
+ spin_unlock(&hwq->lock);
+ }
+ *num_wait += wait_cmds;
+ return rc;
+}
+
+/* function events */
+static int bng_re_process_func_event(struct bng_re_rcfw *rcfw,
+ struct creq_func_event *func_event)
+{
+ switch (func_event->event) {
+ case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CQ_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TQM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TIM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST:
+ case CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* CREQ Completion handlers */
+static void bng_re_service_creq(struct tasklet_struct *t)
+{
+ struct bng_re_rcfw *rcfw = from_tasklet(rcfw, t, creq.creq_tasklet);
+ struct bng_re_creq_ctx *creq = &rcfw->creq;
+ u32 type, budget = BNG_FW_CREQ_ENTRY_POLL_BUDGET;
+ struct bng_re_hwq *hwq = &creq->hwq;
+ struct creq_base *creqe;
+ u32 num_wakeup = 0;
+ u32 hw_polled = 0;
+
+ /* Service the CREQ until budget is over */
+ spin_lock_bh(&hwq->lock);
+ while (budget > 0) {
+ creqe = bng_re_get_qe(hwq, hwq->cons, NULL);
+ if (!BNG_FW_CREQ_CMP_VALID(creqe, creq->creq_db.dbinfo.flags))
+ break;
+ /* The valid test of the entry must be done first before
+ * reading any further.
+ */
+ dma_rmb();
+
+ type = creqe->type & CREQ_BASE_TYPE_MASK;
+ switch (type) {
+ case CREQ_BASE_TYPE_QP_EVENT:
+ bng_re_process_qp_event
+ (rcfw, (struct creq_qp_event *)creqe,
+ &num_wakeup);
+ creq->stats.creq_qp_event_processed++;
+ break;
+ case CREQ_BASE_TYPE_FUNC_EVENT:
+ if (!bng_re_process_func_event
+ (rcfw, (struct creq_func_event *)creqe))
+ creq->stats.creq_func_event_processed++;
+ else
+ dev_warn(&rcfw->pdev->dev,
+ "aeqe:%#x Not handled\n", type);
+ break;
+ default:
+ if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT)
+ dev_warn(&rcfw->pdev->dev,
+ "creqe with event 0x%x not handled\n",
+ type);
+ break;
+ }
+ budget--;
+ hw_polled++;
+ bng_re_hwq_incr_cons(hwq->max_elements, &hwq->cons,
+ 1, &creq->creq_db.dbinfo.flags);
+ }
+
+ if (hw_polled)
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo,
+ rcfw->res->cctx, true);
+ spin_unlock_bh(&hwq->lock);
+ if (num_wakeup)
+ wake_up_nr(&rcfw->cmdq.waitq, num_wakeup);
+}
+
+static int __send_message_basic_sanity(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg,
+ u8 opcode)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+
+ cmdq = &rcfw->cmdq;
+
+ if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
+ return -ETIMEDOUT;
+
+ if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
+ opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
+ dev_err(&rcfw->pdev->dev, "RCFW already initialized!");
+ return -EINVAL;
+ }
+
+ if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
+ (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
+ opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
+ opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
+ dev_err(&rcfw->pdev->dev,
+ "RCFW not initialized, reject opcode 0x%x",
+ opcode);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int __send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg, u8 opcode)
+{
+ u32 bsize, free_slots, required_slots;
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_crsqe *crsqe;
+ struct bng_fw_cmdqe *cmdqe;
+ struct bng_re_hwq *hwq;
+ u32 sw_prod, cmdq_prod;
+ struct pci_dev *pdev;
+ u16 cookie;
+ u8 *preq;
+
+ cmdq = &rcfw->cmdq;
+ hwq = &cmdq->hwq;
+ pdev = rcfw->pdev;
+
+	/* CMDQ entries are in 16-byte units; each request can consume one
+	 * or more cmdqe
+	 */
+ spin_lock_bh(&hwq->lock);
+ required_slots = bng_re_get_cmd_slots(msg->req);
+ free_slots = HWQ_FREE_SLOTS(hwq);
+ cookie = cmdq->seq_num & BNG_FW_MAX_COOKIE_VALUE;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ if (required_slots >= free_slots) {
+ dev_info_ratelimited(&pdev->dev,
+ "CMDQ is full req/free %d/%d!",
+ required_slots, free_slots);
+ spin_unlock_bh(&hwq->lock);
+ return -EAGAIN;
+ }
+ __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie));
+
+ bsize = bng_re_set_cmd_slots(msg->req);
+ crsqe->free_slots = free_slots;
+ crsqe->resp = (struct creq_qp_event *)msg->resp;
+ crsqe->is_waiter_alive = true;
+ crsqe->is_in_used = true;
+ crsqe->opcode = opcode;
+
+ crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz);
+ if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) {
+ struct bng_re_rcfw_sbuf *sbuf = msg->sb;
+
+ __set_cmdq_base_resp_addr(msg->req, msg->req_sz,
+ cpu_to_le64(sbuf->dma_addr));
+ __set_cmdq_base_resp_size(msg->req, msg->req_sz,
+ ALIGN(sbuf->size,
+ BNG_FW_CMDQE_UNITS) /
+ BNG_FW_CMDQE_UNITS);
+ }
+
+ preq = (u8 *)msg->req;
+ do {
+ /* Locate the next cmdq slot */
+ sw_prod = HWQ_CMP(hwq->prod, hwq);
+ cmdqe = bng_re_get_qe(hwq, sw_prod, NULL);
+ /* Copy a segment of the req cmd to the cmdq */
+ memset(cmdqe, 0, sizeof(*cmdqe));
+ memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe)));
+ preq += min_t(u32, bsize, sizeof(*cmdqe));
+ bsize -= min_t(u32, bsize, sizeof(*cmdqe));
+ hwq->prod++;
+ } while (bsize > 0);
+ cmdq->seq_num++;
+
+ cmdq_prod = hwq->prod & 0xFFFF;
+ if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) {
+		/* The very first doorbell write must set this
+		 * flag, which prompts the FW to reset its
+		 * internal pointers.
+		 */
+ cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG);
+ clear_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+ }
+ /* ring CMDQ DB */
+ wmb();
+ writel(cmdq_prod, cmdq->cmdq_mbox.prod);
+ writel(BNG_FW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db);
+ spin_unlock_bh(&hwq->lock);
+	/* The command is queued; the response arrives asynchronously on the CREQ */
+ return 0;
+}
+
+/**
+ * __wait_for_resp - Wait for the firmware response without holding the CPU
+ * @rcfw: rcfw channel instance of rdev
+ * @cookie: cookie to track the command
+ *
+ * Wait for command completion in sleepable context.
+ *
+ * Returns:
+ * 0 if the command is completed by firmware.
+ * Non-zero error code otherwise.
+ */
+static int __wait_for_resp(struct bng_re_rcfw *rcfw, u16 cookie)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_crsqe *crsqe;
+
+ cmdq = &rcfw->cmdq;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ do {
+ wait_event_timeout(cmdq->waitq,
+ !crsqe->is_in_used,
+ secs_to_jiffies(rcfw->max_timeout));
+
+ if (!crsqe->is_in_used)
+ return 0;
+
+ bng_re_service_creq(&rcfw->creq.creq_tasklet);
+
+ if (!crsqe->is_in_used)
+ return 0;
+ } while (true);
+}
+
+/**
+ * bng_re_rcfw_send_message - Send an rcfw command and wait for its completion
+ * @rcfw: rcfw channel instance of rdev
+ * @msg: message to send
+ *
+ * This function does not account for the shadow queue depth. It sends
+ * every command unconditionally as long as the send queue is not full.
+ *
+ * Returns:
+ * 0 if the command is completed by firmware.
+ * Non-zero if the command is not completed by firmware.
+ */
+int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg)
+{
+ struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp;
+ struct bng_re_crsqe *crsqe;
+ u16 cookie;
+ int rc;
+ u8 opcode;
+
+ opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz);
+
+ rc = __send_message_basic_sanity(rcfw, msg, opcode);
+ if (rc)
+ return rc == -ENXIO ? bng_re_map_rc(opcode) : rc;
+
+ rc = __send_message(rcfw, msg, opcode);
+ if (rc)
+ return rc;
+
+ cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz))
+ & BNG_FW_MAX_COOKIE_VALUE;
+
+ rc = __wait_for_resp(rcfw, cookie);
+
+ if (rc) {
+ spin_lock_bh(&rcfw->cmdq.hwq.lock);
+ crsqe = &rcfw->crsqe_tbl[cookie];
+ crsqe->is_waiter_alive = false;
+ if (rc == -ENODEV)
+ set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags);
+ spin_unlock_bh(&rcfw->cmdq.hwq.lock);
+ return -ETIMEDOUT;
+ }
+
+ if (evnt->status) {
+ /* failed with status */
+ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n",
+ cookie, opcode, evnt->status);
+ rc = -EIO;
+ }
+
+ return rc;
+}
+
+static int bng_re_map_cmdq_mbox(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_cmdq_mbox *mbox;
+ resource_size_t bar_reg;
+ struct pci_dev *pdev;
+
+ pdev = rcfw->pdev;
+ mbox = &rcfw->cmdq.cmdq_mbox;
+
+ mbox->reg.bar_id = BNG_FW_COMM_PCI_BAR_REGION;
+ mbox->reg.len = BNG_FW_COMM_SIZE;
+ mbox->reg.bar_base = pci_resource_start(pdev, mbox->reg.bar_id);
+ if (!mbox->reg.bar_base) {
+ dev_err(&pdev->dev,
+ "CMDQ BAR region %d resc start is 0!\n",
+ mbox->reg.bar_id);
+ return -ENOMEM;
+ }
+
+ bar_reg = mbox->reg.bar_base + BNG_FW_COMM_BASE_OFFSET;
+ mbox->reg.bar_reg = ioremap(bar_reg, mbox->reg.len);
+ if (!mbox->reg.bar_reg) {
+ dev_err(&pdev->dev,
+ "CMDQ BAR region %d mapping failed\n",
+ mbox->reg.bar_id);
+ return -ENOMEM;
+ }
+
+ mbox->prod = (void __iomem *)(mbox->reg.bar_reg +
+ BNG_FW_PF_VF_COMM_PROD_OFFSET);
+ mbox->db = (void __iomem *)(mbox->reg.bar_reg + BNG_FW_COMM_TRIG_OFFSET);
+ return 0;
+}
+
+static irqreturn_t bng_re_creq_irq(int irq, void *dev_instance)
+{
+ struct bng_re_rcfw *rcfw = dev_instance;
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_hwq *hwq;
+ u32 sw_cons;
+
+ creq = &rcfw->creq;
+ hwq = &creq->hwq;
+ /* Prefetch the CREQ element */
+ sw_cons = HWQ_CMP(hwq->cons, hwq);
+ bng_re_get_qe(hwq, sw_cons, NULL);
+
+ tasklet_schedule(&creq->creq_tasklet);
+ return IRQ_HANDLED;
+}
+
+int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector,
+ bool need_init)
+{
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_res *res;
+ int rc;
+
+ creq = &rcfw->creq;
+ res = rcfw->res;
+
+ if (creq->irq_handler_avail)
+ return -EFAULT;
+
+ creq->msix_vec = msix_vector;
+ if (need_init)
+ tasklet_setup(&creq->creq_tasklet, bng_re_service_creq);
+ else
+ tasklet_enable(&creq->creq_tasklet);
+
+ creq->irq_name = kasprintf(GFP_KERNEL, "bng_re-creq@pci:%s",
+ pci_name(res->pdev));
+ if (!creq->irq_name)
+ return -ENOMEM;
+ rc = request_irq(creq->msix_vec, bng_re_creq_irq, 0,
+ creq->irq_name, rcfw);
+ if (rc) {
+ kfree(creq->irq_name);
+ creq->irq_name = NULL;
+ tasklet_disable(&creq->creq_tasklet);
+ return rc;
+ }
+ creq->irq_handler_avail = true;
+
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true);
+ atomic_inc(&rcfw->rcfw_intr_enabled);
+
+ return 0;
+}
+
+static int bng_re_map_creq_db(struct bng_re_rcfw *rcfw, u32 reg_offt)
+{
+ struct bng_re_creq_db *creq_db;
+ resource_size_t bar_reg;
+ struct pci_dev *pdev;
+
+ pdev = rcfw->pdev;
+ creq_db = &rcfw->creq.creq_db;
+
+ creq_db->dbinfo.flags = 0;
+ creq_db->reg.bar_id = BNG_FW_COMM_CONS_PCI_BAR_REGION;
+ creq_db->reg.bar_base = pci_resource_start(pdev, creq_db->reg.bar_id);
+	if (!creq_db->reg.bar_base)
+ dev_err(&pdev->dev,
+ "CREQ BAR region %d resc start is 0!",
+ creq_db->reg.bar_id);
+
+ bar_reg = creq_db->reg.bar_base + reg_offt;
+
+ creq_db->reg.len = BNG_FW_CREQ_DB_LEN;
+ creq_db->reg.bar_reg = ioremap(bar_reg, creq_db->reg.len);
+ if (!creq_db->reg.bar_reg) {
+ dev_err(&pdev->dev,
+ "CREQ BAR region %d mapping failed",
+ creq_db->reg.bar_id);
+ return -ENOMEM;
+ }
+ creq_db->dbinfo.db = creq_db->reg.bar_reg;
+ creq_db->dbinfo.hwq = &rcfw->creq.hwq;
+ creq_db->dbinfo.xid = rcfw->creq.ring_id;
+ return 0;
+}
+
+void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill)
+{
+ struct bng_re_creq_ctx *creq;
+
+ creq = &rcfw->creq;
+
+ if (!creq->irq_handler_avail)
+ return;
+
+ creq->irq_handler_avail = false;
+ /* Mask h/w interrupts */
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false);
+ /* Sync with last running IRQ-handler */
+ synchronize_irq(creq->msix_vec);
+ free_irq(creq->msix_vec, rcfw);
+ kfree(creq->irq_name);
+ creq->irq_name = NULL;
+ atomic_set(&rcfw->rcfw_intr_enabled, 0);
+ if (kill)
+ tasklet_kill(&creq->creq_tasklet);
+ tasklet_disable(&creq->creq_tasklet);
+}
+
+void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_cmdq_ctx *cmdq;
+
+ creq = &rcfw->creq;
+ cmdq = &rcfw->cmdq;
+ /* Make sure the HW channel is stopped! */
+ bng_re_rcfw_stop_irq(rcfw, true);
+
+ iounmap(cmdq->cmdq_mbox.reg.bar_reg);
+ iounmap(creq->creq_db.reg.bar_reg);
+
+ cmdq->cmdq_mbox.reg.bar_reg = NULL;
+ creq->creq_db.reg.bar_reg = NULL;
+ creq->msix_vec = 0;
+}
+
+static void bng_re_start_rcfw(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_cmdq_mbox *mbox;
+ struct cmdq_init init = {0};
+
+ cmdq = &rcfw->cmdq;
+ creq = &rcfw->creq;
+ mbox = &cmdq->cmdq_mbox;
+
+ init.cmdq_pbl = cpu_to_le64(cmdq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr[0]);
+ init.cmdq_size_cmdq_lvl =
+ cpu_to_le16(((rcfw->cmdq_depth <<
+ CMDQ_INIT_CMDQ_SIZE_SFT) &
+ CMDQ_INIT_CMDQ_SIZE_MASK) |
+ ((cmdq->hwq.level <<
+ CMDQ_INIT_CMDQ_LVL_SFT) &
+ CMDQ_INIT_CMDQ_LVL_MASK));
+ init.creq_ring_id = cpu_to_le16(creq->ring_id);
+ /* Write to the mailbox register */
+ __iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4);
+}
+
+int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw,
+ int msix_vector,
+ int cp_bar_reg_off)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ int rc;
+
+ cmdq = &rcfw->cmdq;
+
+ /* Assign defaults */
+ cmdq->seq_num = 0;
+ set_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+ init_waitqueue_head(&cmdq->waitq);
+
+ rc = bng_re_map_cmdq_mbox(rcfw);
+ if (rc)
+ return rc;
+
+ rc = bng_re_map_creq_db(rcfw, cp_bar_reg_off);
+ if (rc)
+ return rc;
+
+ rc = bng_re_rcfw_start_irq(rcfw, msix_vector, true);
+ if (rc) {
+ dev_err(&rcfw->pdev->dev,
+ "Failed to request IRQ for CREQ rc = 0x%x\n", rc);
+ bng_re_disable_rcfw_channel(rcfw);
+ return rc;
+ }
+
+ bng_re_start_rcfw(rcfw);
+ return 0;
+}
+
+int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw)
+{
+ struct creq_deinitialize_fw_resp resp = {};
+ struct cmdq_deinitialize_fw req = {};
+ struct bng_re_cmdqmsg msg = {};
+ int rc;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_DEINITIALIZE_FW,
+ sizeof(req));
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL,
+ sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return rc;
+
+ clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
+ return 0;
+}
+
+static inline bool _is_hw_retx_supported(u16 dev_cap_flags)
+{
+ return dev_cap_flags &
+ (CREQ_QUERY_FUNC_RESP_SB_HW_REQUESTER_RETX_ENABLED |
+ CREQ_QUERY_FUNC_RESP_SB_HW_RESPONDER_RETX_ENABLED);
+}
+
+#define BNG_RE_HW_RETX(a) _is_hw_retx_supported((a))
+
+static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2)
+{
+ return dev_cap_ext_flags2 &
+ CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED;
+}
+
+int bng_re_init_rcfw(struct bng_re_rcfw *rcfw,
+ struct bng_re_stats *stats_ctx)
+{
+ struct creq_initialize_fw_resp resp = {};
+ struct cmdq_initialize_fw req = {};
+ struct bng_re_cmdqmsg msg = {};
+ int rc;
+ u16 flags = 0;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_INITIALIZE_FW,
+ sizeof(req));
+	/* Supply (log-base-2-of-host-page-size - base-page-shift)
+	 * to the firmware to adjust the doorbell page sizes.
+	 */
+ req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT -
+ BNG_FW_DBR_BASE_PAGE_SHIFT);
+ if (BNG_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags))
+ flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED;
+ if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2))
+ flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED;
+ req.flags |= cpu_to_le16(flags);
+ req.stat_ctx_id = cpu_to_le32(stats_ctx->fw_id);
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return rc;
+ set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
+ return 0;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h
new file mode 100644
index 000000000000..c89c926ec2fc
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_fw.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_FW_H__
+#define __BNG_FW_H__
+
+#include "bng_tlv.h"
+
+/* FW DB related */
+#define BNG_FW_CMDQ_TRIG_VAL 1
+#define BNG_FW_COMM_PCI_BAR_REGION 0
+#define BNG_FW_COMM_CONS_PCI_BAR_REGION 2
+#define BNG_FW_DBR_BASE_PAGE_SHIFT 12
+#define BNG_FW_COMM_SIZE 0x104
+#define BNG_FW_COMM_BASE_OFFSET 0x600
+#define BNG_FW_COMM_TRIG_OFFSET 0x100
+#define BNG_FW_PF_VF_COMM_PROD_OFFSET 0xc
+#define BNG_FW_CREQ_DB_LEN 8
+
+/* CREQ */
+#define BNG_FW_CREQE_MAX_CNT (64 * 1024)
+#define BNG_FW_CREQE_UNITS 16
+#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100
+#define BNG_FW_CREQ_CMP_VALID(hdr, pass) \
+ (!!((hdr)->v & CREQ_BASE_V) == \
+ !((pass) & BNG_RE_FLAG_EPOCH_CONS_MASK))
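+/*
+ * An entry is "valid" when its V bit disagrees with the consumer-side
+ * epoch flag: on the first pass through the ring new entries have V set;
+ * each consumer wrap flips the epoch (see bng_re_hwq_incr_cons()), so the
+ * expected polarity inverts and stale entries are never re-consumed.
+ */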
+
+/* CMDQ */
+struct bng_fw_cmdqe {
+ u8 data[16];
+};
+
+#define BNG_FW_CMDQE_MAX_CNT 8192
+#define BNG_FW_CMDQE_UNITS sizeof(struct bng_fw_cmdqe)
+#define BNG_FW_CMDQE_BYTES(depth) ((depth) * BNG_FW_CMDQE_UNITS)
+
+#define BNG_FW_MAX_COOKIE_VALUE (BNG_FW_CMDQE_MAX_CNT - 1)
+#define BNG_FW_CMD_IS_BLOCKING 0x8000
+
+/* Crsq buf is 1024-Byte */
+struct bng_re_crsbe {
+ u8 data[1024];
+};
+
+static inline u32 bng_fw_cmdqe_npages(u32 depth)
+{
+ u32 npages;
+
+ npages = BNG_FW_CMDQE_BYTES(depth) / PAGE_SIZE;
+ if (BNG_FW_CMDQE_BYTES(depth) % PAGE_SIZE)
+ npages++;
+ return npages;
+}
+
+static inline u32 bng_fw_cmdqe_page_size(u32 depth)
+{
+ return (bng_fw_cmdqe_npages(depth) * PAGE_SIZE);
+}
+
+struct bng_re_cmdq_mbox {
+ struct bng_re_reg_desc reg;
+ void __iomem *prod;
+ void __iomem *db;
+};
+
+/* CMDQ context */
+struct bng_re_cmdq_ctx {
+ struct bng_re_hwq hwq;
+ struct bng_re_cmdq_mbox cmdq_mbox;
+ unsigned long flags;
+#define FIRMWARE_INITIALIZED_FLAG (0)
+#define FIRMWARE_STALL_DETECTED (3)
+#define FIRMWARE_FIRST_FLAG (31)
+ wait_queue_head_t waitq;
+ u32 seq_num;
+};
+
+struct bng_re_creq_db {
+ struct bng_re_reg_desc reg;
+ struct bng_re_db_info dbinfo;
+};
+
+struct bng_re_creq_stat {
+ u64 creq_qp_event_processed;
+ u64 creq_func_event_processed;
+};
+
+struct bng_re_creq_ctx {
+ struct bng_re_hwq hwq;
+ struct bng_re_creq_db creq_db;
+ struct bng_re_creq_stat stats;
+ struct tasklet_struct creq_tasklet;
+ u16 ring_id;
+ int msix_vec;
+ bool irq_handler_avail;
+ char *irq_name;
+};
+
+struct bng_re_crsqe {
+ struct creq_qp_event *resp;
+ u32 req_size;
+ /* Free slots at the time of submission */
+ u32 free_slots;
+ u8 opcode;
+ bool is_waiter_alive;
+ bool is_in_used;
+};
+
+struct bng_re_rcfw_sbuf {
+ void *sb;
+ dma_addr_t dma_addr;
+ u32 size;
+};
+
+/* RoCE FW Communication Channels */
+struct bng_re_rcfw {
+ struct pci_dev *pdev;
+ struct bng_re_res *res;
+ struct bng_re_cmdq_ctx cmdq;
+ struct bng_re_creq_ctx creq;
+ struct bng_re_crsqe *crsqe_tbl;
+ /* To synchronize the qp-handle hash table */
+ spinlock_t tbl_lock;
+ u32 cmdq_depth;
+ /* cached from chip cctx for quick reference in slow path */
+ u16 max_timeout;
+ atomic_t rcfw_intr_enabled;
+};
+
+struct bng_re_cmdqmsg {
+ struct cmdq_base *req;
+ struct creq_base *resp;
+ void *sb;
+ u32 req_sz;
+ u32 res_sz;
+ u8 block;
+};
+
+static inline void bng_re_rcfw_cmd_prep(struct cmdq_base *req,
+ u8 opcode, u8 cmd_size)
+{
+ req->opcode = opcode;
+ req->cmd_size = cmd_size;
+}
+
+static inline void bng_re_fill_cmdqmsg(struct bng_re_cmdqmsg *msg,
+ void *req, void *resp, void *sb,
+ u32 req_sz, u32 res_sz, u8 block)
+{
+ msg->req = req;
+ msg->resp = resp;
+ msg->sb = sb;
+ msg->req_sz = req_sz;
+ msg->res_sz = res_sz;
+ msg->block = block;
+}
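+
+/*
+ * Typical command flow (this mirrors bng_re_deinit_rcfw() in bng_fw.c;
+ * shown here only as an illustration of the helpers above):
+ *
+ *	struct creq_deinitialize_fw_resp resp = {};
+ *	struct cmdq_deinitialize_fw req = {};
+ *	struct bng_re_cmdqmsg msg = {};
+ *
+ *	bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ *			     CMDQ_BASE_OPCODE_DEINITIALIZE_FW, sizeof(req));
+ *	bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req),
+ *			    sizeof(resp), 0);
+ *	rc = bng_re_rcfw_send_message(rcfw, &msg);
+ */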
+
+/* Get the number of command units required for the req. The function
+ * returns the correct value only if called before the request is
+ * modified by bng_re_set_cmd_slots().
+ */
+static inline u32 bng_re_get_cmd_slots(struct cmdq_base *req)
+{
+ u32 cmd_units = 0;
+
+ if (HAS_TLV_HEADER(req)) {
+ struct roce_tlv *tlv_req = (struct roce_tlv *)req;
+
+ cmd_units = tlv_req->total_size;
+ } else {
+ cmd_units = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) /
+ BNG_FW_CMDQE_UNITS;
+ }
+
+ return cmd_units;
+}
+
+static inline u32 bng_re_set_cmd_slots(struct cmdq_base *req)
+{
+ u32 cmd_byte = 0;
+
+ if (HAS_TLV_HEADER(req)) {
+ struct roce_tlv *tlv_req = (struct roce_tlv *)req;
+
+ cmd_byte = tlv_req->total_size * BNG_FW_CMDQE_UNITS;
+ } else {
+ cmd_byte = req->cmd_size;
+ req->cmd_size = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) /
+ BNG_FW_CMDQE_UNITS;
+ }
+
+ return cmd_byte;
+}
+
+void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw);
+int bng_re_alloc_fw_channel(struct bng_re_res *res,
+ struct bng_re_rcfw *rcfw);
+int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw,
+ int msix_vector,
+ int cp_bar_reg_off);
+void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw);
+int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector,
+ bool need_init);
+void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill);
+int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg);
+int bng_re_init_rcfw(struct bng_re_rcfw *rcfw,
+ struct bng_re_stats *stats_ctx);
+int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h
new file mode 100644
index 000000000000..dae4862621a7
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_re.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RE_H__
+#define __BNG_RE_H__
+
+#include "bng_res.h"
+
+#define BNG_RE_ADEV_NAME "bng_en"
+
+#define BNG_RE_DESC "Broadcom 800G RoCE Driver"
+
+#define rdev_to_dev(rdev) ((rdev) ? (&(rdev)->ibdev.dev) : NULL)
+
+#define BNG_RE_MIN_MSIX 2
+#define BNG_RE_MAX_MSIX BNGE_MAX_ROCE_MSIX
+
+#define BNG_RE_CREQ_NQ_IDX 0
+
+#define BNGE_INVALID_STATS_CTX_ID -1
+
+/* NQ specific structures */
+struct bng_re_nq_db {
+ struct bng_re_reg_desc reg;
+ struct bng_re_db_info dbinfo;
+};
+
+struct bng_re_nq {
+ struct pci_dev *pdev;
+ struct bng_re_res *res;
+ char *name;
+ struct bng_re_hwq hwq;
+ struct bng_re_nq_db nq_db;
+ u16 ring_id;
+ int msix_vec;
+ cpumask_t mask;
+ struct tasklet_struct nq_tasklet;
+ bool requested;
+ int budget;
+ u32 load;
+
+ struct workqueue_struct *cqn_wq;
+};
+
+struct bng_re_nq_record {
+ struct bnge_msix_info msix_entries[BNG_RE_MAX_MSIX];
+ struct bng_re_nq nq[BNG_RE_MAX_MSIX];
+ int num_msix;
+ /* serialize NQ access */
+ struct mutex load_lock;
+};
+
+struct bng_re_en_dev_info {
+ struct bng_re_dev *rdev;
+ struct bnge_auxr_dev *auxr_dev;
+};
+
+struct bng_re_ring_attr {
+ dma_addr_t *dma_arr;
+ int pages;
+ int type;
+ u32 depth;
+ u32 lrid; /* Logical ring id */
+ u8 mode;
+};
+
+struct bng_re_dev {
+ struct ib_device ibdev;
+ unsigned long flags;
+#define BNG_RE_FLAG_NETDEV_REGISTERED 0
+#define BNG_RE_FLAG_RCFW_CHANNEL_EN 1
+ struct net_device *netdev;
+ struct auxiliary_device *adev;
+ struct bnge_auxr_dev *aux_dev;
+ struct bng_re_chip_ctx *chip_ctx;
+ int fn_id;
+ struct bng_re_res bng_res;
+ struct bng_re_rcfw rcfw;
+ struct bng_re_nq_record *nqr;
+ /* Device Resources */
+ struct bng_re_dev_attr *dev_attr;
+ struct dentry *dbg_root;
+ struct bng_re_stats stats_ctx;
+};
+
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_res.c b/drivers/infiniband/hw/bng_re/bng_res.c
new file mode 100644
index 000000000000..c50823758b53
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_res.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+#include <rdma/ib_umem.h>
+
+#include <linux/bnxt/hsi.h>
+#include "bng_res.h"
+#include "roce_hsi.h"
+
+/* Stats */
+void bng_re_free_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_stats *stats)
+{
+ if (stats->dma) {
+ dma_free_coherent(&pdev->dev, stats->size,
+ stats->dma, stats->dma_map);
+ }
+ memset(stats, 0, sizeof(*stats));
+ stats->fw_id = -1;
+}
+
+int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_chip_ctx *cctx,
+ struct bng_re_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->fw_id = -1;
+ stats->size = cctx->hw_stats_size;
+ stats->dma = dma_alloc_coherent(&pdev->dev, stats->size,
+ &stats->dma_map, GFP_KERNEL);
+ if (!stats->dma)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void bng_free_pbl(struct bng_re_res *res, struct bng_re_pbl *pbl)
+{
+ struct pci_dev *pdev = res->pdev;
+ int i;
+
+ for (i = 0; i < pbl->pg_count; i++) {
+ if (pbl->pg_arr[i])
+ dma_free_coherent(&pdev->dev, pbl->pg_size,
+ (void *)((unsigned long)
+ pbl->pg_arr[i] &
+ PAGE_MASK),
+ pbl->pg_map_arr[i]);
+ else
+ dev_warn(&pdev->dev,
+ "PBL free pg_arr[%d] empty?!\n", i);
+ pbl->pg_arr[i] = NULL;
+ }
+
+ vfree(pbl->pg_arr);
+ pbl->pg_arr = NULL;
+ vfree(pbl->pg_map_arr);
+ pbl->pg_map_arr = NULL;
+ pbl->pg_count = 0;
+ pbl->pg_size = 0;
+}
+
+static int bng_alloc_pbl(struct bng_re_res *res,
+ struct bng_re_pbl *pbl,
+ struct bng_re_sg_info *sginfo)
+{
+ struct pci_dev *pdev = res->pdev;
+ u32 pages;
+ int i;
+
+ if (sginfo->nopte)
+ return 0;
+ pages = sginfo->npages;
+
+ /* page ptr arrays */
+ pbl->pg_arr = vmalloc_array(pages, sizeof(void *));
+ if (!pbl->pg_arr)
+ return -ENOMEM;
+
+ pbl->pg_map_arr = vmalloc_array(pages, sizeof(dma_addr_t));
+ if (!pbl->pg_map_arr) {
+ vfree(pbl->pg_arr);
+ pbl->pg_arr = NULL;
+ return -ENOMEM;
+ }
+ pbl->pg_count = 0;
+ pbl->pg_size = sginfo->pgsize;
+
+ for (i = 0; i < pages; i++) {
+ pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev,
+ pbl->pg_size,
+ &pbl->pg_map_arr[i],
+ GFP_KERNEL);
+ if (!pbl->pg_arr[i])
+ goto fail;
+ pbl->pg_count++;
+ }
+
+ return 0;
+fail:
+ bng_free_pbl(res, pbl);
+ return -ENOMEM;
+}
+
+void bng_re_free_hwq(struct bng_re_res *res,
+ struct bng_re_hwq *hwq)
+{
+ int i;
+
+ if (!hwq->max_elements)
+ return;
+ if (hwq->level >= BNG_PBL_LVL_MAX)
+ return;
+
+ for (i = 0; i < hwq->level + 1; i++)
+ bng_free_pbl(res, &hwq->pbl[i]);
+
+ hwq->level = BNG_PBL_LVL_MAX;
+ hwq->max_elements = 0;
+ hwq->element_size = 0;
+ hwq->prod = 0;
+ hwq->cons = 0;
+}
+
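+/*
+ * Summary of the PBL indirection levels built below:
+ *   BNG_PBL_LVL_0 - the queue fits in a single page; PTEs map it directly.
+ *   BNG_PBL_LVL_1 - one level of indirection: a PBL page holds PTE pointers.
+ *   BNG_PBL_LVL_2 - two levels: a PDE page points to PBL pages, which in
+ *                   turn point to the PTE pages.
+ */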
+/* All HWQs are power of 2 in size */
+int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq,
+ struct bng_re_hwq_attr *hwq_attr)
+{
+ u32 npages, pg_size;
+ struct bng_re_sg_info sginfo = {};
+ u32 depth, stride, npbl, npde;
+ dma_addr_t *src_phys_ptr, **dst_virt_ptr;
+ struct bng_re_res *res;
+ struct pci_dev *pdev;
+ int i, rc, lvl;
+
+ res = hwq_attr->res;
+ pdev = res->pdev;
+ pg_size = hwq_attr->sginfo->pgsize;
+ hwq->level = BNG_PBL_LVL_MAX;
+
+ depth = roundup_pow_of_two(hwq_attr->depth);
+ stride = roundup_pow_of_two(hwq_attr->stride);
+
+ npages = (depth * stride) / pg_size;
+ if ((depth * stride) % pg_size)
+ npages++;
+ if (!npages)
+ return -EINVAL;
+ hwq_attr->sginfo->npages = npages;
+
+ if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) {
+ /* This request is Level 0, map PTE */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_0;
+ goto done;
+ }
+
+ if (npages >= MAX_PBL_LVL_0_PGS) {
+ if (npages > MAX_PBL_LVL_1_PGS) {
+ u32 flag = PTU_PTE_VALID;
+ /* 2 levels of indirection */
+ npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+ if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+ npbl++;
+ npde = npbl >> MAX_PDL_LVL_SHIFT;
+ if (npbl % BIT(MAX_PDL_LVL_SHIFT))
+ npde++;
+ /* Alloc PDE pages */
+ sginfo.pgsize = npde * pg_size;
+ sginfo.npages = 1;
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo);
+ if (rc)
+ goto fail;
+
+ /* Alloc PBL pages */
+ sginfo.npages = npbl;
+ sginfo.pgsize = PAGE_SIZE;
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], &sginfo);
+ if (rc)
+ goto fail;
+ /* Fill PDL with PBL page pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++)
+ dst_virt_ptr[0][i] = src_phys_ptr[i] | flag;
+
+ /* Alloc or init PTEs */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_2],
+ hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_2;
+ if (hwq_attr->sginfo->nopte)
+ goto done;
+ /* Fill PBLs with PTE pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_1].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_2].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_2].pg_count; i++) {
+ dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
+ src_phys_ptr[i] | PTU_PTE_VALID;
+ }
+ if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) {
+ /* Find the last pg of the size */
+ i = hwq->pbl[BNG_PBL_LVL_2].pg_count;
+ dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
+ PTU_PTE_LAST;
+ if (i > 1)
+ dst_virt_ptr[PTR_PG(i - 2)]
+ [PTR_IDX(i - 2)] |=
+ PTU_PTE_NEXT_TO_LAST;
+ }
+		} else { /* pages < 512: npbl = 1, npde = 0 */
+ u32 flag = PTU_PTE_VALID;
+
+ /* 1 level of indirection */
+ npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+ if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+ npbl++;
+ sginfo.npages = npbl;
+ sginfo.pgsize = PAGE_SIZE;
+ /* Alloc PBL page */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo);
+ if (rc)
+ goto fail;
+ /* Alloc or init PTEs */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1],
+ hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_1;
+ if (hwq_attr->sginfo->nopte)
+ goto done;
+ /* Fill PBL with PTE pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++)
+ dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
+ src_phys_ptr[i] | flag;
+ if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) {
+ /* Find the last pg of the size */
+ i = hwq->pbl[BNG_PBL_LVL_1].pg_count;
+ dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
+ PTU_PTE_LAST;
+ if (i > 1)
+ dst_virt_ptr[PTR_PG(i - 2)]
+ [PTR_IDX(i - 2)] |=
+ PTU_PTE_NEXT_TO_LAST;
+ }
+ }
+ }
+done:
+ hwq->prod = 0;
+ hwq->cons = 0;
+ hwq->pdev = pdev;
+ hwq->depth = hwq_attr->depth;
+ hwq->max_elements = hwq->depth;
+ hwq->element_size = stride;
+ hwq->qe_ppg = pg_size / stride;
+ /* For direct access to the elements */
+ lvl = hwq->level;
+ if (hwq_attr->sginfo->nopte && hwq->level)
+ lvl = hwq->level - 1;
+ hwq->pbl_ptr = hwq->pbl[lvl].pg_arr;
+ hwq->pbl_dma_ptr = hwq->pbl[lvl].pg_map_arr;
+ spin_lock_init(&hwq->lock);
+
+ return 0;
+fail:
+ bng_re_free_hwq(res, hwq);
+ return -ENOMEM;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h
new file mode 100644
index 000000000000..9997f86d6a0e
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_res.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RES_H__
+#define __BNG_RES_H__
+
+#include "roce_hsi.h"
+
+#define BNG_ROCE_FW_MAX_TIMEOUT 60
+
+#define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *))
+#define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1)
+#define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG)
+#define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG)
+
+#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1))
+#define HWQ_FREE_SLOTS(hwq) (hwq->max_elements - \
+ ((HWQ_CMP(hwq->prod, hwq)\
+ - HWQ_CMP(hwq->cons, hwq))\
+ & (hwq->max_elements - 1)))
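+
+/*
+ * Example: with max_elements = 8, prod = 10 and cons = 5, the ring holds
+ * (2 - 5) & 7 = 5 in-flight elements, so HWQ_FREE_SLOTS() reports 3 free.
+ */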
+
+#define MAX_PBL_LVL_0_PGS 1
+#define MAX_PBL_LVL_1_PGS 512
+#define MAX_PBL_LVL_1_PGS_SHIFT 9
+#define MAX_PBL_LVL_1_PGS_FOR_LVL_2 256
+#define MAX_PBL_LVL_2_PGS (256 * 512)
+#define MAX_PDL_LVL_SHIFT 9
+
+#define BNG_RE_DBR_VALID (0x1UL << 26)
+#define BNG_RE_DBR_EPOCH_SHIFT 24
+#define BNG_RE_DBR_TOGGLE_SHIFT 25
+
+#define BNG_MAX_TQM_ALLOC_REQ 48
+
+struct bng_re_reg_desc {
+ u8 bar_id;
+ resource_size_t bar_base;
+ unsigned long offset;
+ void __iomem *bar_reg;
+ size_t len;
+};
+
+struct bng_re_db_info {
+ void __iomem *db;
+ void __iomem *priv_db;
+ struct bng_re_hwq *hwq;
+ u32 xid;
+ u32 max_slot;
+ u32 flags;
+ u8 toggle;
+};
+
+enum bng_re_db_info_flags_mask {
+ BNG_RE_FLAG_EPOCH_CONS_SHIFT = 0x0UL,
+ BNG_RE_FLAG_EPOCH_PROD_SHIFT = 0x1UL,
+ BNG_RE_FLAG_EPOCH_CONS_MASK = 0x1UL,
+ BNG_RE_FLAG_EPOCH_PROD_MASK = 0x2UL,
+};
+
+enum bng_re_db_epoch_flag_shift {
+ BNG_RE_DB_EPOCH_CONS_SHIFT = BNG_RE_DBR_EPOCH_SHIFT,
+ BNG_RE_DB_EPOCH_PROD_SHIFT = (BNG_RE_DBR_EPOCH_SHIFT - 1),
+};
+
+struct bng_re_chip_ctx {
+ u16 chip_num;
+ u16 hw_stats_size;
+ u64 hwrm_intf_ver;
+ u16 hwrm_cmd_max_timeout;
+};
+
+struct bng_re_pbl {
+ u32 pg_count;
+ u32 pg_size;
+ void **pg_arr;
+ dma_addr_t *pg_map_arr;
+};
+
+enum bng_re_pbl_lvl {
+ BNG_PBL_LVL_0,
+ BNG_PBL_LVL_1,
+ BNG_PBL_LVL_2,
+ BNG_PBL_LVL_MAX
+};
+
+enum bng_re_hwq_type {
+ BNG_HWQ_TYPE_CTX,
+ BNG_HWQ_TYPE_QUEUE
+};
+
+struct bng_re_sg_info {
+ u32 npages;
+ u32 pgshft;
+ u32 pgsize;
+ bool nopte;
+};
+
+struct bng_re_hwq_attr {
+ struct bng_re_res *res;
+ struct bng_re_sg_info *sginfo;
+ enum bng_re_hwq_type type;
+ u32 depth;
+ u32 stride;
+ u32 aux_stride;
+ u32 aux_depth;
+};
+
+struct bng_re_hwq {
+ struct pci_dev *pdev;
+ /* lock to protect hwq */
+ spinlock_t lock;
+ struct bng_re_pbl pbl[BNG_PBL_LVL_MAX + 1];
+ /* Valid values: 0, 1, 2 */
+ enum bng_re_pbl_lvl level;
+ /* PBL entries */
+ void **pbl_ptr;
+ /* PBL dma_addr */
+ dma_addr_t *pbl_dma_ptr;
+ u32 max_elements;
+ u32 depth;
+ u16 element_size;
+ u32 prod;
+ u32 cons;
+ /* queue entry per page */
+ u16 qe_ppg;
+};
+
+struct bng_re_stats {
+ dma_addr_t dma_map;
+ void *dma;
+ u32 size;
+ u32 fw_id;
+};
+
+struct bng_re_res {
+ struct pci_dev *pdev;
+ struct bng_re_chip_ctx *cctx;
+ struct bng_re_dev_attr *dattr;
+};
+
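+/* Translate a linear queue index into the address of its element: qe_ppg
+ * elements fit in each page, so the index splits into a page number and
+ * an offset within that page.
+ */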
+static inline void *bng_re_get_qe(struct bng_re_hwq *hwq,
+ u32 indx, u64 *pg)
+{
+ u32 pg_num, pg_idx;
+
+ pg_num = (indx / hwq->qe_ppg);
+ pg_idx = (indx % hwq->qe_ppg);
+ if (pg)
+ *pg = (u64)&hwq->pbl_ptr[pg_num];
+ return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx);
+}
+
+#define BNG_RE_INIT_DBHDR(xid, type, indx, toggle) \
+ (((u64)(((xid) & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | \
+ (type) | BNG_RE_DBR_VALID) << 32) | (indx) | \
+ (((u32)(toggle)) << (BNG_RE_DBR_TOGGLE_SHIFT)))
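+
+/*
+ * Doorbell layout produced by BNG_RE_INIT_DBHDR(): the upper 32 bits carry
+ * the xid, RoCE path, type and valid (bit 26) fields; the lower 32 bits
+ * carry the ring index with the toggle (bit 25) and epoch (bit 24) bits
+ * folded in by the callers below.
+ */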
+
+static inline void bng_re_ring_db(struct bng_re_db_info *info,
+ u32 type)
+{
+ u64 key = 0;
+ u32 indx;
+ u8 toggle = 0;
+
+ if (type == DBC_DBC_TYPE_CQ_ARMALL ||
+ type == DBC_DBC_TYPE_CQ_ARMSE)
+ toggle = info->toggle;
+
+ indx = (info->hwq->cons & DBC_DBC_INDEX_MASK) |
+ ((info->flags & BNG_RE_FLAG_EPOCH_CONS_MASK) <<
+ BNG_RE_DB_EPOCH_CONS_SHIFT);
+
+ key = BNG_RE_INIT_DBHDR(info->xid, type, indx, toggle);
+ writeq(key, info->db);
+}
+
+static inline void bng_re_ring_nq_db(struct bng_re_db_info *info,
+ struct bng_re_chip_ctx *cctx,
+ bool arm)
+{
+ u32 type;
+
+ type = arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+ bng_re_ring_db(info, type);
+}
+
+static inline void bng_re_hwq_incr_cons(u32 max_elements, u32 *cons, u32 cnt,
+ u32 *dbinfo_flags)
+{
+ /* move cons and update toggle/epoch if wrap around */
+ *cons += cnt;
+ if (*cons >= max_elements) {
+ *cons %= max_elements;
+ *dbinfo_flags ^= 1UL << BNG_RE_FLAG_EPOCH_CONS_SHIFT;
+ }
+}
+
+static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2)
+{
+ return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED);
+}
+
+void bng_re_free_hwq(struct bng_re_res *res,
+ struct bng_re_hwq *hwq);
+
+int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq,
+ struct bng_re_hwq_attr *hwq_attr);
+
+void bng_re_free_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_stats *stats);
+
+int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_chip_ctx *cctx,
+ struct bng_re_stats *stats);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_sp.c b/drivers/infiniband/hw/bng_re/bng_sp.c
new file mode 100644
index 000000000000..83099e05328d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_sp.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bng_sp.h"
+#include "bng_tlv.h"
+
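+/* RDMA atomics are usable only when the PCIe AtomicOp Requester Enable
+ * bit is set in this function's Device Control 2 register.
+ */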
+static bool bng_re_is_atomic_cap(struct bng_re_rcfw *rcfw)
+{
+ u16 pcie_ctl2 = 0;
+
+ pcie_capability_read_word(rcfw->pdev, PCI_EXP_DEVCTL2, &pcie_ctl2);
+ return (pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ);
+}
+
+static void bng_re_query_version(struct bng_re_rcfw *rcfw,
+ char *fw_ver)
+{
+ struct creq_query_version_resp resp = {};
+ struct bng_re_cmdqmsg msg = {};
+ struct cmdq_query_version req = {};
+ int rc;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_QUERY_VERSION,
+ sizeof(req));
+
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return;
+ fw_ver[0] = resp.fw_maj;
+ fw_ver[1] = resp.fw_minor;
+ fw_ver[2] = resp.fw_bld;
+ fw_ver[3] = resp.fw_rsvd;
+}
+
+int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_dev_attr *attr = rcfw->res->dattr;
+ struct creq_query_func_resp resp = {};
+ struct bng_re_cmdqmsg msg = {};
+ struct creq_query_func_resp_sb *sb;
+ struct bng_re_rcfw_sbuf sbuf;
+ struct cmdq_query_func req = {};
+ u8 *tqm_alloc;
+ int i, rc;
+ u32 temp;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_QUERY_FUNC,
+ sizeof(req));
+
+ sbuf.size = ALIGN(sizeof(*sb), BNG_FW_CMDQE_UNITS);
+ sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size,
+ &sbuf.dma_addr, GFP_KERNEL);
+ if (!sbuf.sb)
+ return -ENOMEM;
+ sb = sbuf.sb;
+ req.resp_size = sbuf.size / BNG_FW_CMDQE_UNITS;
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req),
+ sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ goto bail;
+ /* Extract the context from the side buffer */
+ attr->max_qp = le32_to_cpu(sb->max_qp);
+ /* max_qp value reported by FW doesn't include the QP1 */
+ attr->max_qp += 1;
+ attr->max_qp_rd_atom =
+ sb->max_qp_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ?
+ BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom;
+ attr->max_qp_init_rd_atom =
+ sb->max_qp_init_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ?
+ BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom;
+ attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1;
+
+ /* Adjust for max_qp_wqes for variable wqe */
+ attr->max_qp_wqes = min_t(u32, attr->max_qp_wqes, BNG_VAR_MAX_WQE - 1);
+
+ attr->max_qp_sges = min_t(u32, sb->max_sge_var_wqe, BNG_VAR_MAX_SGE);
+ attr->max_cq = le32_to_cpu(sb->max_cq);
+ attr->max_cq_wqes = le32_to_cpu(sb->max_cqe);
+ attr->max_cq_sges = attr->max_qp_sges;
+ attr->max_mr = le32_to_cpu(sb->max_mr);
+ attr->max_mw = le32_to_cpu(sb->max_mw);
+
+ attr->max_mr_size = le64_to_cpu(sb->max_mr_size);
+ attr->max_pd = 64 * 1024;
+ attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp);
+ attr->max_ah = le32_to_cpu(sb->max_ah);
+
+ attr->max_srq = le16_to_cpu(sb->max_srq);
+ attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
+ attr->max_srq_sges = sb->max_srq_sge;
+ attr->max_pkey = 1;
+ attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
+	/*
+	 * Read the max gid supported by HW.
+	 * For each entry in the HW GID table, we consume 2 GID entries
+	 * in the kernel GID table. So max_gid reported to the stack can
+	 * be up to twice the value reported by the HW, capped at 256 GIDs.
+	 */
+ attr->max_sgid = le32_to_cpu(sb->max_gid);
+ attr->max_sgid = min_t(u32, BNG_RE_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid);
+ attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags);
+ attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2);
+
+ if (_is_max_srq_ext_supported(attr->dev_cap_flags2))
+ attr->max_srq += le16_to_cpu(sb->max_srq_ext);
+
+ bng_re_query_version(rcfw, attr->fw_ver);
+ for (i = 0; i < BNG_MAX_TQM_ALLOC_REQ / 4; i++) {
+ temp = le32_to_cpu(sb->tqm_alloc_reqs[i]);
+ tqm_alloc = (u8 *)&temp;
+ attr->tqm_alloc_reqs[i * 4] = *tqm_alloc;
+ attr->tqm_alloc_reqs[i * 4 + 1] = *(++tqm_alloc);
+ attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc);
+ attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc);
+ }
+
+ attr->max_dpi = le32_to_cpu(sb->max_dpi);
+ attr->is_atomic = bng_re_is_atomic_cap(rcfw);
+bail:
+ dma_free_coherent(&rcfw->pdev->dev, sbuf.size,
+ sbuf.sb, sbuf.dma_addr);
+ return rc;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_sp.h b/drivers/infiniband/hw/bng_re/bng_sp.h
new file mode 100644
index 000000000000..e15190515ed1
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_sp.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_SP_H__
+#define __BNG_SP_H__
+
+#include "bng_fw.h"
+
+#define BNG_VAR_MAX_WQE 4352
+#define BNG_VAR_MAX_SGE 13
+
+struct bng_re_dev_attr {
+#define FW_VER_ARR_LEN 4
+ u8 fw_ver[FW_VER_ARR_LEN];
+#define BNG_RE_NUM_GIDS_SUPPORTED 256
+ u16 max_sgid;
+ u16 max_mrw;
+ u32 max_qp;
+#define BNG_RE_MAX_OUT_RD_ATOM 126
+ u32 max_qp_rd_atom;
+ u32 max_qp_init_rd_atom;
+ u32 max_qp_wqes;
+ u32 max_qp_sges;
+ u32 max_cq;
+ u32 max_cq_wqes;
+ u32 max_cq_sges;
+ u32 max_mr;
+ u64 max_mr_size;
+ u32 max_pd;
+ u32 max_mw;
+ u32 max_raw_ethy_qp;
+ u32 max_ah;
+ u32 max_srq;
+ u32 max_srq_wqes;
+ u32 max_srq_sges;
+ u32 max_pkey;
+ u32 max_inline_data;
+ u32 l2_db_size;
+ u8 tqm_alloc_reqs[BNG_MAX_TQM_ALLOC_REQ];
+ bool is_atomic;
+ u16 dev_cap_flags;
+ u16 dev_cap_flags2;
+ u32 max_dpi;
+};
+
+int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_tlv.h b/drivers/infiniband/hw/bng_re/bng_tlv.h
new file mode 100644
index 000000000000..278f4922962d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_tlv.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+
+#ifndef __BNG_TLV_H__
+#define __BNG_TLV_H__
+
+#include "roce_hsi.h"
+
+struct roce_tlv {
+ struct tlv tlv;
+	struct tlv tlv;
+	u8 total_size; /* in units of 16-byte chunks */
+	u8 unused[7];  /* for 16-byte alignment */
+
+/*
+ * TLV size in units of 16 byte chunks
+ */
+#define TLV_SIZE ((sizeof(struct roce_tlv) + 15) / 16)
+/*
+ * TLV length in bytes
+ */
+#define TLV_BYTES (TLV_SIZE * 16)
+
+#define HAS_TLV_HEADER(msg) (le16_to_cpu(((struct tlv *)(msg))->cmd_discr) == CMD_DISCR_TLV_ENCAP)
+#define GET_TLV_DATA(tlv) ((void *)&((uint8_t *)(tlv))[TLV_BYTES])
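+
+/*
+ * A TLV-encapsulated command places a struct roce_tlv header in the first
+ * 16-byte chunk; the real cmdq_base payload starts TLV_BYTES into the
+ * buffer. GET_TLV_DATA() returns that payload, and the __get/__set helpers
+ * below transparently pick the encapsulated or the plain location.
+ */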
+
+static inline u8 __get_cmdq_base_opcode(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->opcode;
+ else
+ return req->opcode;
+}
+
+static inline void __set_cmdq_base_opcode(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->opcode = val;
+ else
+ req->opcode = val;
+}
+
+static inline __le16 __get_cmdq_base_cookie(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->cookie;
+ else
+ return req->cookie;
+}
+
+static inline void __set_cmdq_base_cookie(struct cmdq_base *req,
+ u32 size, __le16 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->cookie = val;
+ else
+ req->cookie = val;
+}
+
+static inline __le64 __get_cmdq_base_resp_addr(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr;
+ else
+ return req->resp_addr;
+}
+
+static inline void __set_cmdq_base_resp_addr(struct cmdq_base *req,
+ u32 size, __le64 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr = val;
+ else
+ req->resp_addr = val;
+}
+
+static inline u8 __get_cmdq_base_resp_size(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size;
+ else
+ return req->resp_size;
+}
+
+static inline void __set_cmdq_base_resp_size(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size = val;
+ else
+ req->resp_size = val;
+}
+
+static inline u8 __get_cmdq_base_cmd_size(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct roce_tlv *)(req))->total_size;
+ else
+ return req->cmd_size;
+}
+
+static inline void __set_cmdq_base_cmd_size(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->cmd_size = val;
+ else
+ req->cmd_size = val;
+}
+
+static inline __le16 __get_cmdq_base_flags(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->flags;
+ else
+ return req->flags;
+}
+
+static inline void __set_cmdq_base_flags(struct cmdq_base *req,
+ u32 size, __le16 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->flags = val;
+ else
+ req->flags = val;
+}
+
+#endif /* __BNG_TLV_H__ */
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 3485e495ac6a..3a7ce4729fcf 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -224,6 +224,8 @@ struct bnxt_re_dev {
struct workqueue_struct *dcb_wq;
struct dentry *cc_config;
struct bnxt_re_dbg_cc_config_params *cc_config_params;
+ struct dentry *cq_coal_cfg;
+ struct bnxt_re_dbg_cq_coal_params *cq_coal_cfg_params;
#define BNXT_VPD_FLD_LEN 32
char board_partno[BNXT_VPD_FLD_LEN];
/* RoCE mirror */
diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c
index be5e9b5ca2f0..88817c86ae24 100644
--- a/drivers/infiniband/hw/bnxt_re/debugfs.c
+++ b/drivers/infiniband/hw/bnxt_re/debugfs.c
@@ -23,6 +23,14 @@
static struct dentry *bnxt_re_debugfs_root;
+static const char * const bnxt_re_cq_coal_str[] = {
+ "buf_maxtime",
+ "normal_maxbuf",
+ "during_maxbuf",
+ "en_ring_idle_mode",
+ "enable",
+};
+
static const char * const bnxt_re_cc_gen0_name[] = {
"enable_cc",
"run_avg_weight_g",
@@ -349,6 +357,123 @@ static void bnxt_re_debugfs_add_info(struct bnxt_re_dev *rdev)
debugfs_create_file("info", 0400, rdev->dbg_root, rdev, &info_fops);
}
+static ssize_t cq_coal_cfg_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct seq_file *s = file->private_data;
+ struct bnxt_re_cq_coal_param *param = s->private;
+ struct bnxt_re_dev *rdev = param->rdev;
+ int offset = param->offset;
+ char lbuf[16] = { };
+ u32 val;
+
+ if (count > sizeof(lbuf))
+ return -EINVAL;
+
+ if (copy_from_user(lbuf, buf, count))
+ return -EFAULT;
+
+ lbuf[sizeof(lbuf) - 1] = '\0';
+
+ if (kstrtou32(lbuf, 0, &val))
+ return -EINVAL;
+
+ switch (offset) {
+ case BNXT_RE_COAL_CQ_BUF_MAXTIME:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME)
+ return -EINVAL;
+ rdev->cq_coalescing.buf_maxtime = val;
+ break;
+ case BNXT_RE_COAL_CQ_NORMAL_MAXBUF:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF)
+ return -EINVAL;
+ rdev->cq_coalescing.normal_maxbuf = val;
+ break;
+ case BNXT_RE_COAL_CQ_DURING_MAXBUF:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF)
+ return -EINVAL;
+ rdev->cq_coalescing.during_maxbuf = val;
+ break;
+ case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE:
+ if (val > BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE)
+ return -EINVAL;
+ rdev->cq_coalescing.en_ring_idle_mode = val;
+ break;
+ case BNXT_RE_COAL_CQ_ENABLE:
+ if (val > 1)
+ return -EINVAL;
+ rdev->cq_coalescing.enable = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return count;
+}
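+
+/*
+ * Example (illustrative path; the actual location depends on the debugfs
+ * mount point and the device name under the bnxt_re debugfs root):
+ *
+ *   echo 1 > /sys/kernel/debug/bnxt_re/<dev>/cq_coal_cfg/enable
+ *   cat /sys/kernel/debug/bnxt_re/<dev>/cq_coal_cfg/buf_maxtime
+ */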
+
+static int cq_coal_cfg_show(struct seq_file *s, void *unused)
+{
+ struct bnxt_re_cq_coal_param *param = s->private;
+ struct bnxt_re_dev *rdev = param->rdev;
+ int offset = param->offset;
+ u32 val = 0;
+
+ switch (offset) {
+ case BNXT_RE_COAL_CQ_BUF_MAXTIME:
+ val = rdev->cq_coalescing.buf_maxtime;
+ break;
+ case BNXT_RE_COAL_CQ_NORMAL_MAXBUF:
+ val = rdev->cq_coalescing.normal_maxbuf;
+ break;
+ case BNXT_RE_COAL_CQ_DURING_MAXBUF:
+ val = rdev->cq_coalescing.during_maxbuf;
+ break;
+ case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE:
+ val = rdev->cq_coalescing.en_ring_idle_mode;
+ break;
+ case BNXT_RE_COAL_CQ_ENABLE:
+ val = rdev->cq_coalescing.enable;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ seq_printf(s, "%u\n", val);
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(cq_coal_cfg);
+
+static void bnxt_re_cleanup_cq_coal_debugfs(struct bnxt_re_dev *rdev)
+{
+ debugfs_remove_recursive(rdev->cq_coal_cfg);
+ kfree(rdev->cq_coal_cfg_params);
+}
+
+static void bnxt_re_init_cq_coal_debugfs(struct bnxt_re_dev *rdev)
+{
+ struct bnxt_re_dbg_cq_coal_params *dbg_cq_coal_params;
+ int i;
+
+ if (!_is_cq_coalescing_supported(rdev->dev_attr->dev_cap_flags2))
+ return;
+
+ dbg_cq_coal_params = kzalloc(sizeof(*dbg_cq_coal_params), GFP_KERNEL);
+ if (!dbg_cq_coal_params)
+ return;
+
+ rdev->cq_coal_cfg = debugfs_create_dir("cq_coal_cfg", rdev->dbg_root);
+ rdev->cq_coal_cfg_params = dbg_cq_coal_params;
+
+ for (i = 0; i < BNXT_RE_COAL_CQ_MAX; i++) {
+ dbg_cq_coal_params->params[i].offset = i;
+ dbg_cq_coal_params->params[i].rdev = rdev;
+ debugfs_create_file(bnxt_re_cq_coal_str[i],
+ 0600, rdev->cq_coal_cfg,
+ &dbg_cq_coal_params->params[i],
+ &cq_coal_cfg_fops);
+ }
+}
+
void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev)
{
struct pci_dev *pdev = rdev->en_dev->pdev;
@@ -374,10 +499,13 @@ void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev)
rdev->cc_config, tmp_params,
&bnxt_re_cc_config_ops);
}
+
+ bnxt_re_init_cq_coal_debugfs(rdev);
}
void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev)
{
+ bnxt_re_cleanup_cq_coal_debugfs(rdev);
debugfs_remove_recursive(rdev->qp_debugfs);
debugfs_remove_recursive(rdev->cc_config);
kfree(rdev->cc_config_params);
diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h
index 8f101df4e838..98f4620ef245 100644
--- a/drivers/infiniband/hw/bnxt_re/debugfs.h
+++ b/drivers/infiniband/hw/bnxt_re/debugfs.h
@@ -33,4 +33,23 @@ struct bnxt_re_cc_param {
struct bnxt_re_dbg_cc_config_params {
struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0];
};
+
+struct bnxt_re_cq_coal_param {
+ struct bnxt_re_dev *rdev;
+ u32 offset;
+};
+
+enum bnxt_re_cq_coal_types {
+ BNXT_RE_COAL_CQ_BUF_MAXTIME,
+ BNXT_RE_COAL_CQ_NORMAL_MAXBUF,
+ BNXT_RE_COAL_CQ_DURING_MAXBUF,
+ BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE,
+ BNXT_RE_COAL_CQ_ENABLE,
+ BNXT_RE_COAL_CQ_MAX
+};
+
+struct bnxt_re_dbg_cq_coal_params {
+ struct bnxt_re_cq_coal_param params[BNXT_RE_COAL_CQ_MAX];
+};
#endif
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 84ce3fce2826..f19b55c13d58 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -601,7 +601,8 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
mr->qplib_mr.va = (u64)(unsigned long)fence->va;
mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES;
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL,
- BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE);
+ BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE,
+ _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags));
if (rc) {
ibdev_err(&rdev->ibdev, "Failed to register fence-MR\n");
goto fail;
@@ -4027,7 +4028,7 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags)
mr->qplib_mr.hwq.level = PBL_LVL_MAX;
mr->qplib_mr.total_size = -1; /* Infinte length */
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, 0,
- PAGE_SIZE);
+ PAGE_SIZE, false);
if (rc)
goto fail_mr;
@@ -4257,7 +4258,8 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64
umem_pgs = ib_umem_num_dma_blocks(umem, page_size);
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem,
- umem_pgs, page_size);
+ umem_pgs, page_size,
+ _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags));
if (rc) {
ibdev_err(&rdev->ibdev, "Failed to register user MR - rc = %d\n", rc);
rc = -EIO;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index b13810572c2e..73003ad25ee8 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1453,6 +1453,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev,
atomic_set(&rdev->stats.res.pd_count, 0);
rdev->cosq[0] = 0xFFFF;
rdev->cosq[1] = 0xFFFF;
+ rdev->cq_coalescing.enable = 1;
rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME;
if (bnxt_re_chip_gen_p7(en_dev->chip_num)) {
rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index ce90d3d834d4..c88f049136fc 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -2226,7 +2226,8 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
req.cq_handle = cpu_to_le64(cq->cq_handle);
req.cq_size = cpu_to_le32(cq->max_wqe);
- if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) {
+ if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2) &&
+ cq->coalescing->enable) {
req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID);
coalescing |= ((cq->coalescing->buf_maxtime <<
CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) &
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index b990d0c0ce1a..1b414a73b46d 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -395,6 +395,7 @@ struct bnxt_qplib_cq_coal_param {
u8 normal_maxbuf;
u8 during_maxbuf;
u8 en_ring_idle_mode;
+ u8 enable;
};
#define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 9ef581ed785c..408a34df2667 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -162,7 +162,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw)
attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
attr->max_srq_sges = sb->max_srq_sge;
attr->max_pkey = 1;
- attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
+ attr->max_inline_data = attr->max_qp_sges * sizeof(struct sq_sge);
if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx))
attr->l2_db_size = (sb->l2_db_space_size + 1) *
(0x01 << RCFW_DBR_BASE_PAGE_SHIFT);
@@ -578,7 +578,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
}
int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
- struct ib_umem *umem, int num_pbls, u32 buf_pg_size)
+ struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct bnxt_qplib_hwq_attr hwq_attr = {};
@@ -640,7 +640,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
req.access = (mr->access_flags & BNXT_QPLIB_MR_ACCESS_MASK);
req.va = cpu_to_le64(mr->va);
req.key = cpu_to_le32(mr->lkey);
- if (_is_alloc_mr_unified(res->dattr->dev_cap_flags))
+ if (unified_mr)
req.key = cpu_to_le32(mr->pd->id);
req.flags = cpu_to_le16(mr->flags);
req.mr_size = cpu_to_le64(mr->total_size);
@@ -651,7 +651,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
if (rc)
goto fail;
- if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) {
+ if (unified_mr) {
mr->lkey = le32_to_cpu(resp.xid);
mr->rkey = mr->lkey;
}
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 147b5d9c0313..5a45c55c6464 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -341,7 +341,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
bool block);
int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
- struct ib_umem *umem, int num_pbls, u32 buf_pg_size);
+ struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr);
int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr);
int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res,
struct bnxt_qplib_mrw *mr, int max);
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index dcdfe250bdbe..adeed7447e7b 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -348,7 +348,7 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
{
int err;
- pr_debug("*pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+ pr_debug("*pbl_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
pbl_addr, rdev->lldi.vr->pbl.start,
pbl_size);
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index b35f92e7d865..e4aef102dac0 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -745,8 +745,8 @@ static int create_workqueues(struct hfi1_devdata *dd)
ppd->hfi1_wq =
alloc_workqueue(
"hfi%d_%d",
- WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
- WQ_MEM_RECLAIM,
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
+ WQ_PERCPU,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
dd->unit, pidx);
if (!ppd->hfi1_wq)
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
index 370a5a8eaa71..6e0e3458d202 100644
--- a/drivers/infiniband/hw/hfi1/opfn.c
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -305,8 +305,8 @@ void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
int opfn_init(void)
{
opfn_wq = alloc_workqueue("hfi_opfn",
- WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
- WQ_MEM_RECLAIM,
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
+ WQ_PERCPU,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
if (!opfn_wq)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile
index baf592e6f21b..d07ef02c5231 100644
--- a/drivers/infiniband/hw/hns/Makefile
+++ b/drivers/infiniband/hw/hns/Makefile
@@ -4,11 +4,13 @@
#
ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
+ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3pf
+ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3_common
ccflags-y += -I $(src)
hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \
- hns_roce_debugfs.o hns_roce_hw_v2.o
+ hns_roce_debugfs.o hns_roce_hw_v2.o hns_roce_bond.o
obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 307c35888b30..0c1c32d23c88 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include "hns_roce_device.h"
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
new file mode 100644
index 000000000000..cc85f3ce1f3e
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -0,0 +1,1012 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#include <net/lag.h>
+#include <net/bonding.h>
+#include "hns_roce_device.h"
+#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
+
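+/* Bond groups are tracked per die: this xarray maps a PCI bus number to
+ * the struct hns_roce_die_info that owns every bond group on that die.
+ */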
+static DEFINE_XARRAY(roce_bond_xa);
+
+static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev)
+{
+ struct ib_device *ibdev =
+ ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS);
+
+ if (!ibdev)
+ return NULL;
+
+ return container_of(ibdev, struct hns_roce_dev, ib_dev);
+}
+
+static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev)
+{
+ struct net_device *upper_dev;
+
+ rcu_read_lock();
+ upper_dev = netdev_master_upper_dev_get_rcu(net_dev);
+ dev_hold(upper_dev);
+ rcu_read_unlock();
+
+ return upper_dev;
+}
+
+static int get_netdev_bond_slave_id(struct net_device *net_dev,
+ struct hns_roce_bond_group *bond_grp)
+{
+ int i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++)
+ if (net_dev == bond_grp->bond_func_info[i].net_dev)
+ return i;
+
+ return -ENOENT;
+}
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+ u8 bus_num)
+{
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+ struct hns_roce_bond_group *bond_grp;
+ struct net_device *upper_dev = NULL;
+ int i;
+
+ if (!die_info)
+ return NULL;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ return bond_grp;
+ if (bond_grp->upper_dev) {
+ upper_dev = get_upper_dev_from_ndev(net_dev);
+ if (bond_grp->upper_dev == upper_dev) {
+ dev_put(upper_dev);
+ return bond_grp;
+ }
+ dev_put(upper_dev);
+ }
+ }
+
+ return NULL;
+}
+
+static int hns_roce_set_bond_netdev(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev)
+{
+ struct net_device *active_dev;
+ struct net_device *old_dev;
+ int i, ret = 0;
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ rcu_read_lock();
+ active_dev =
+ bond_option_active_slave_get_rcu(netdev_priv(bond_grp->upper_dev));
+ rcu_read_unlock();
+	} else {
+		for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+			active_dev = bond_grp->bond_func_info[i].net_dev;
+			if (active_dev &&
+			    ib_get_curr_port_state(active_dev) == IB_PORT_ACTIVE)
+				break;
+		}
+		/* 'i' is only meaningful on this branch, so fold the
+		 * no-active-slave check in here.
+		 */
+		if (i == ROCE_BOND_FUNC_MAX)
+			active_dev = NULL;
+	}
+
+	if (!active_dev)
+		active_dev = get_hr_netdev(hr_dev, 0);
+
+ old_dev = ib_device_get_netdev(&hr_dev->ib_dev, 1);
+ if (old_dev == active_dev)
+ goto out;
+
+ ret = ib_device_set_netdev(&hr_dev->ib_dev, active_dev, 1);
+ if (ret) {
+ dev_err(hr_dev->dev, "failed to set netdev for bond.\n");
+ goto out;
+ }
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ if (old_dev)
+ roce_del_all_netdev_gids(&hr_dev->ib_dev, 1, old_dev);
+ rdma_roce_rescan_port(&hr_dev->ib_dev, 1);
+ }
+out:
+ dev_put(old_dev);
+ return ret;
+}
+
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED &&
+ bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED)
+ return true;
+
+ return false;
+}
+
+static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ u32 active_slave_map = 0;
+ u8 active_slave_num = 0;
+ bool active;
+ u8 i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if (!net_dev || !(bond_grp->slave_map & (1U << i)))
+ continue;
+
+ active = (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ?
+ net_lag_port_dev_txable(net_dev) :
+ (ib_get_curr_port_state(net_dev) == IB_PORT_ACTIVE);
+ if (active) {
+ active_slave_num++;
+ active_slave_map |= (1U << i);
+ }
+ }
+
+ bond_grp->active_slave_num = active_slave_num;
+ bond_grp->active_slave_map = active_slave_map;
+}
+
+static int hns_roce_recover_bond(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev)
+{
+ bond_grp->main_hr_dev = hr_dev;
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ return hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+}
+
+static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx)
+{
+ struct hnae3_handle *handle;
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (handle->priv)
+ hns_roce_bond_uninit_client(bond_grp, func_idx);
+}
+
+static struct hns_roce_dev
+ *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx, bool need_switch);
+
+static int switch_main_dev(struct hns_roce_bond_group *bond_grp,
+ u8 main_func_idx)
+{
+ struct hns_roce_dev *hr_dev;
+ struct net_device *net_dev;
+ u8 i;
+
+ bond_grp->main_hr_dev = NULL;
+ hns_roce_bond_uninit_client(bond_grp, main_func_idx);
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if ((bond_grp->slave_map & (1U << i)) && net_dev) {
+ /* In case this slave is still being registered as
+ * a non-bonded PF, uninit it first and then re-init
+ * it as the main device.
+ */
+ hns_roce_slave_uninit(bond_grp, i);
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev) {
+ bond_grp->main_hr_dev = hr_dev;
+ break;
+ }
+ }
+ }
+
+ if (!bond_grp->main_hr_dev)
+ return -ENODEV;
+
+ return 0;
+}
+
+static struct hns_roce_dev
+ *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx, bool need_switch)
+{
+ struct hns_roce_dev *hr_dev = NULL;
+ struct hnae3_handle *handle;
+ u8 main_func_idx;
+ int ret;
+
+ if (need_switch) {
+ main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
+ if (func_idx == main_func_idx) {
+ ret = switch_main_dev(bond_grp, main_func_idx);
+ if (ret == -ENODEV)
+ return NULL;
+ }
+ }
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (handle) {
+ if (handle->priv)
+ return handle->priv;
+ /* Prevent this device from being initialized as a bond device */
+ if (need_switch)
+ bond_grp->bond_func_info[func_idx].net_dev = NULL;
+ hr_dev = hns_roce_bond_init_client(bond_grp, func_idx);
+ if (!hr_dev)
+ BOND_ERR_LOG("failed to init slave %u.\n", func_idx);
+ }
+
+ return hr_dev;
+}
+
+static struct hns_roce_die_info *alloc_die_info(int bus_num)
+{
+ struct hns_roce_die_info *die_info;
+ int ret;
+
+ die_info = kzalloc(sizeof(*die_info), GFP_KERNEL);
+ if (!die_info)
+ return NULL;
+
+ ret = xa_err(xa_store(&roce_bond_xa, bus_num, die_info, GFP_KERNEL));
+ if (ret) {
+ kfree(die_info);
+ return NULL;
+ }
+
+ mutex_init(&die_info->die_mutex);
+
+ return die_info;
+}
+
+static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num)
+{
+ mutex_destroy(&die_info->die_mutex);
+ xa_erase(&roce_bond_xa, bus_num);
+ kfree(die_info);
+}
+
+static int alloc_bond_id(struct hns_roce_bond_group *bond_grp)
+{
+ u8 bus_num = bond_grp->bus_num;
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+ int i;
+
+ if (!die_info) {
+ die_info = alloc_die_info(bus_num);
+ if (!die_info)
+ return -ENOMEM;
+ }
+
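+ /* Claim the first free bond ID on this die */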
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ if (die_info->bond_id_mask & BOND_ID(i))
+ continue;
+
+ die_info->bond_id_mask |= BOND_ID(i);
+ die_info->bgrps[i] = bond_grp;
+ bond_grp->bond_id = i;
+
+ return 0;
+ }
+
+ return -ENOSPC;
+}
+
+static int remove_bond_id(int bus_num, u8 bond_id)
+{
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+
+ if (bond_id >= ROCE_BOND_NUM_MAX)
+ return -EINVAL;
+
+ if (!die_info)
+ return -ENODEV;
+
+ die_info->bond_id_mask &= ~BOND_ID(bond_id);
+ die_info->bgrps[bond_id] = NULL;
+ if (!die_info->bond_id_mask)
+ dealloc_die_info(die_info, bus_num);
+
+ return 0;
+}
+
+static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp)
+{
+ struct hns_roce_dev *hr_dev;
+ int ret;
+ int i;
+
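+ /* Tear down each slave's own RoCE instance before re-initializing
+ * one of them as the bond's main device.
+ */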
+ for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) {
+ if (bond_grp->slave_map & (1 << i))
+ hns_roce_slave_uninit(bond_grp, i);
+ }
+
+ mutex_lock(&bond_grp->bond_mutex);
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+ bond_grp->main_hr_dev = NULL;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ if (bond_grp->slave_map & (1 << i)) {
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev) {
+ bond_grp->main_hr_dev = hr_dev;
+ break;
+ }
+ }
+ }
+
+ if (!bond_grp->main_hr_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+
+out:
+ if (ret) {
+ BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret);
+ hns_roce_cleanup_bond(bond_grp);
+ } else {
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE set bond finished!\n");
+ }
+}
+
+static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp)
+{
+ u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
+ struct hns_roce_dev *hr_dev;
+ u8 i;
+
+ if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED)
+ goto out;
+
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ bond_grp->main_hr_dev = NULL;
+
+ hns_roce_slave_uninit(bond_grp, main_func_idx);
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev)
+ bond_grp->main_hr_dev = hr_dev;
+ }
+
+out:
+ hns_roce_cleanup_bond(bond_grp);
+}
+
+static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
+
+ mutex_lock(&bond_grp->bond_mutex);
+ if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGESTATE)
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+
+ if (ret)
+ ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+ "failed to change RoCE bond slave state, ret = %d.\n",
+ ret);
+ else
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE slave changestate finished!\n");
+}
+
+static void hns_roce_slave_change_num(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+ u8 i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ if (bond_grp->slave_map & (1U << i)) {
+ if (i == PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn))
+ continue;
+ hns_roce_slave_uninit(bond_grp, i);
+ } else {
+ hns_roce_slave_init(bond_grp, i, true);
+ if (!bond_grp->main_hr_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ bond_grp->bond_func_info[i].net_dev = NULL;
+ bond_grp->bond_func_info[i].handle = NULL;
+ }
+ }
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
+
+out:
+ if (ret) {
+ BOND_ERR_LOG("failed to change RoCE bond slave num, ret = %d.\n", ret);
+ hns_roce_cleanup_bond(bond_grp);
+ } else {
+ mutex_lock(&bond_grp->bond_mutex);
+ if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGE_NUM)
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE slave change num finished!\n");
+ }
+}
+
+static void hns_roce_bond_info_update_nolock(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ struct hns_roce_v2_priv *priv;
+ struct hns_roce_dev *hr_dev;
+ struct net_device *net_dev;
+ int func_idx;
+
+ bond_grp->slave_map = 0;
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+ func_idx = get_netdev_bond_slave_id(net_dev, bond_grp);
+ if (func_idx < 0) {
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev)
+ continue;
+ func_idx = PCI_FUNC(hr_dev->pci_dev->devfn);
+ if (!bond_grp->bond_func_info[func_idx].net_dev) {
+ priv = hr_dev->priv;
+ bond_grp->bond_func_info[func_idx].net_dev =
+ net_dev;
+ bond_grp->bond_func_info[func_idx].handle =
+ priv->handle;
+ }
+ ib_device_put(&hr_dev->ib_dev);
+ }
+
+ bond_grp->slave_map |= (1 << func_idx);
+ }
+ rcu_read_unlock();
+}
+
+static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct hns_roce_dev *hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ bool ret = true;
+
+ if (!hr_dev) {
+ if (bond_grp &&
+ get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ return true;
+ else
+ return false;
+ }
+
+ if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) {
+ ret = false;
+ goto out;
+ }
+
+ if (hr_dev->is_vf || pci_num_vf(hr_dev->pci_dev) > 0) {
+ ret = false;
+ goto out;
+ }
+
+ if (bond_grp->bus_num != get_hr_bus_num(hr_dev))
+ ret = false;
+
+out:
+ ib_device_put(&hr_dev->ib_dev);
+ return ret;
+}
+
+static bool check_slave_support(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ struct net_device *net_dev;
+ u8 slave_num = 0;
+
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+ if (is_dev_bond_supported(bond_grp, net_dev)) {
+ slave_num++;
+ continue;
+ }
+ rcu_read_unlock();
+ return false;
+ }
+ rcu_read_unlock();
+
+ return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX);
+}
+
+static void hns_roce_bond_work(struct work_struct *work)
+{
+ struct delayed_work *delayed_work = to_delayed_work(work);
+ struct hns_roce_bond_group *bond_grp =
+ container_of(delayed_work, struct hns_roce_bond_group,
+ bond_work);
+ enum hns_roce_bond_state bond_state;
+ bool bond_ready;
+
+ mutex_lock(&bond_grp->bond_mutex);
+ bond_ready = check_slave_support(bond_grp, bond_grp->upper_dev);
+ hns_roce_bond_info_update_nolock(bond_grp, bond_grp->upper_dev);
+ bond_state = bond_grp->bond_state;
+ bond_grp->bond_ready = bond_ready;
+ mutex_unlock(&bond_grp->bond_mutex);
+
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "bond work: bond_ready - %d, bond_state - %d.\n",
+ bond_ready, bond_state);
+
+ if (!bond_ready) {
+ hns_roce_clear_bond(bond_grp);
+ return;
+ }
+
+ switch (bond_state) {
+ case HNS_ROCE_BOND_NOT_BONDED:
+ hns_roce_set_bond(bond_grp);
+ /* In the set_bond flow there is no need to set the bond netdev
+ * here, as that is already done when bond_grp->main_hr_dev is
+ * registered.
+ */
+ return;
+ case HNS_ROCE_BOND_SLAVE_CHANGESTATE:
+ hns_roce_slave_changestate(bond_grp);
+ break;
+ case HNS_ROCE_BOND_SLAVE_CHANGE_NUM:
+ hns_roce_slave_change_num(bond_grp);
+ break;
+ default:
+ return;
+ }
+ hns_roce_set_bond_netdev(bond_grp, bond_grp->main_hr_dev);
+}
+
+static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev,
+ struct net_device *upper_dev)
+{
+ bond_grp->upper_dev = upper_dev;
+ bond_grp->main_hr_dev = hr_dev;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ bond_grp->bond_ready = false;
+}
+
+static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp)
+{
+ mutex_lock(&bond_grp->bond_mutex);
+
+ cancel_delayed_work(&bond_grp->bond_work);
+ bond_grp->upper_dev = NULL;
+ bond_grp->main_hr_dev = NULL;
+ bond_grp->bond_ready = false;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
+ bond_grp->slave_map = 0;
+ memset(bond_grp->bond_func_info, 0, sizeof(bond_grp->bond_func_info));
+
+ mutex_unlock(&bond_grp->bond_mutex);
+}
+
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+
+ ret = bond_grp->main_hr_dev ?
+ hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO;
+ if (ret)
+ BOND_ERR_LOG("failed to clear RoCE bond, ret = %d.\n", ret);
+ else
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE clear bond finished!\n");
+
+ hns_roce_detach_bond_grp(bond_grp);
+}
+
+static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct hns_roce_bond_group *bond_grp_tmp;
+
+ bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bond_grp->bus_num);
+ return bond_grp_tmp == bond_grp;
+}
+
+static void lowerstate_event_setting(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changelowerstate_info *info)
+{
+ mutex_lock(&bond_grp->bond_mutex);
+
+ if (bond_grp->bond_ready &&
+ bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED)
+ bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGESTATE;
+
+ mutex_unlock(&bond_grp->bond_mutex);
+}
+
+static bool hns_roce_bond_lowerstate_event(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changelowerstate_info *info)
+{
+ struct net_device *net_dev =
+ netdev_notifier_info_to_dev((struct netdev_notifier_info *)info);
+
+ if (!netif_is_lag_port(net_dev))
+ return false;
+
+ if (!lowerstate_event_filter(bond_grp, net_dev))
+ return false;
+
+ lowerstate_event_setting(bond_grp, info);
+
+ return true;
+}
+
+static bool is_bond_setting_supported(struct netdev_lag_upper_info *bond_info)
+{
+ if (!bond_info)
+ return false;
+
+ if (bond_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
+ bond_info->tx_type != NETDEV_LAG_TX_TYPE_HASH)
+ return false;
+
+ if (bond_info->tx_type == NETDEV_LAG_TX_TYPE_HASH &&
+ bond_info->hash_type > NETDEV_LAG_HASH_L23)
+ return false;
+
+ return true;
+}
+
+static void upper_event_setting(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netdev_lag_upper_info *bond_upper_info = NULL;
+ bool slave_inc = info->linking;
+
+ if (slave_inc)
+ bond_upper_info = info->upper_info;
+
+ if (bond_upper_info) {
+ bond_grp->tx_type = bond_upper_info->tx_type;
+ bond_grp->hash_type = bond_upper_info->hash_type;
+ }
+}
+
+static bool check_unlinking_bond_support(struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ u8 slave_num = 0;
+
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(bond_grp->upper_dev, net_dev) {
+ if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ slave_num++;
+ }
+ rcu_read_unlock();
+
+ return (slave_num > 1);
+}
+
+static bool check_linking_bond_support(struct netdev_lag_upper_info *bond_info,
+ struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ if (!is_bond_setting_supported(bond_info))
+ return false;
+
+ return check_slave_support(bond_grp, upper_dev);
+}
+
+static enum bond_support_type
+ check_bond_support(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ bool bond_grp_exist = false;
+ bool support;
+
+ if (upper_dev == bond_grp->upper_dev)
+ bond_grp_exist = true;
+
+ if (!info->linking && !bond_grp_exist)
+ return BOND_NOT_SUPPORT;
+
+ if (info->linking)
+ support = check_linking_bond_support(info->upper_info, bond_grp,
+ upper_dev);
+ else
+ support = check_unlinking_bond_support(bond_grp);
+
+ if (support)
+ return BOND_SUPPORT;
+
+ return bond_grp_exist ? BOND_EXISTING_NOT_SUPPORT : BOND_NOT_SUPPORT;
+}
+
+static bool upper_event_filter(struct netdev_notifier_changeupper_info *info,
+ struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct net_device *upper_dev = info->upper_dev;
+ struct hns_roce_bond_group *bond_grp_tmp;
+ struct hns_roce_dev *hr_dev;
+ bool ret = true;
+ u8 bus_num;
+
+ if (!info->linking ||
+ bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED)
+ return bond_grp->upper_dev == upper_dev;
+
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev)
+ return false;
+
+ bus_num = get_hr_bus_num(hr_dev);
+ if (bond_grp->bus_num != bus_num) {
+ ret = false;
+ goto out;
+ }
+
+ bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp_tmp && bond_grp_tmp != bond_grp)
+ ret = false;
+out:
+ ib_device_put(&hr_dev->ib_dev);
+ return ret;
+}
+
+static bool hns_roce_bond_upper_event(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct net_device *net_dev =
+ netdev_notifier_info_to_dev((struct netdev_notifier_info *)info);
+ struct net_device *upper_dev = info->upper_dev;
+ enum bond_support_type support = BOND_SUPPORT;
+ struct hns_roce_dev *hr_dev;
+ int slave_id;
+
+ if (!upper_dev || !netif_is_lag_master(upper_dev))
+ return false;
+
+ if (!upper_event_filter(info, bond_grp, net_dev))
+ return false;
+
+ mutex_lock(&bond_grp->bond_mutex);
+ support = check_bond_support(bond_grp, upper_dev, info);
+ if (support == BOND_NOT_SUPPORT) {
+ mutex_unlock(&bond_grp->bond_mutex);
+ return false;
+ }
+
+ if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_ATTACHED) {
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev) {
+ mutex_unlock(&bond_grp->bond_mutex);
+ return false;
+ }
+ hns_roce_attach_bond_grp(bond_grp, hr_dev, upper_dev);
+ ib_device_put(&hr_dev->ib_dev);
+ }
+
+ /* If the netdev is being unregistered, the RoCE
+ * instance must not be initialized.
+ */
+ if (net_dev->reg_state >= NETREG_UNREGISTERING) {
+ slave_id = get_netdev_bond_slave_id(net_dev, bond_grp);
+ if (slave_id >= 0) {
+ bond_grp->bond_func_info[slave_id].net_dev = NULL;
+ bond_grp->bond_func_info[slave_id].handle = NULL;
+ }
+ }
+
+ if (support == BOND_SUPPORT) {
+ bond_grp->bond_ready = true;
+ if (bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED)
+ bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGE_NUM;
+ }
+ mutex_unlock(&bond_grp->bond_mutex);
+ if (support == BOND_SUPPORT)
+ upper_event_setting(bond_grp, info);
+
+ return true;
+}
+
+static int hns_roce_bond_event(struct notifier_block *self,
+ unsigned long event, void *ptr)
+{
+ struct hns_roce_bond_group *bond_grp =
+ container_of(self, struct hns_roce_bond_group, bond_nb);
+ bool changed = false;
+
+ if (event == NETDEV_CHANGEUPPER)
+ changed = hns_roce_bond_upper_event(bond_grp, ptr);
+ if (event == NETDEV_CHANGELOWERSTATE)
+ changed = hns_roce_bond_lowerstate_event(bond_grp, ptr);
+
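+ /* Process the change asynchronously after a one-second delay */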
+ if (changed)
+ schedule_delayed_work(&bond_grp->bond_work, HZ);
+
+ return NOTIFY_DONE;
+}
+
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ int ret;
+ int i;
+
+ if (xa_load(&roce_bond_xa, bus_num))
+ return 0;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = kvzalloc(sizeof(*bond_grp), GFP_KERNEL);
+ if (!bond_grp) {
+ ret = -ENOMEM;
+ goto mem_err;
+ }
+
+ mutex_init(&bond_grp->bond_mutex);
+ INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_bond_work);
+
+ bond_grp->bond_ready = false;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
+ bond_grp->bus_num = bus_num;
+
+ ret = alloc_bond_id(bond_grp);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "failed to alloc bond ID, ret = %d.\n", ret);
+ goto alloc_id_err;
+ }
+
+ bond_grp->bond_nb.notifier_call = hns_roce_bond_event;
+ ret = register_netdevice_notifier(&bond_grp->bond_nb);
+ if (ret) {
+ ibdev_err(&hr_dev->ib_dev,
+ "failed to register bond nb, ret = %d.\n", ret);
+ goto register_nb_err;
+ }
+ bgrps[i] = bond_grp;
+ }
+
+ return 0;
+
+register_nb_err:
+ remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
+alloc_id_err:
+ mutex_destroy(&bond_grp->bond_mutex);
+ kvfree(bond_grp);
+mem_err:
+ for (i--; i >= 0; i--) {
+ unregister_netdevice_notifier(&bgrps[i]->bond_nb);
+ cancel_delayed_work_sync(&bgrps[i]->bond_work);
+ remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id);
+ mutex_destroy(&bgrps[i]->bond_mutex);
+ kvfree(bgrps[i]);
+ }
+ return ret;
+}
+
+void hns_roce_dealloc_bond_grp(void)
+{
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ unsigned long id;
+ int i;
+
+ xa_for_each(&roce_bond_xa, id, die_info) {
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ unregister_netdevice_notifier(&bond_grp->bond_nb);
+ cancel_delayed_work_sync(&bond_grp->bond_work);
+ remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
+ mutex_destroy(&bond_grp->bond_mutex);
+ kvfree(bond_grp);
+ }
+ }
+}
+
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_v2_priv *priv = hr_dev->priv;
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ int ret;
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+
+ if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT) {
+ ret = hns_roce_recover_bond(bond_grp, hr_dev);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "failed to recover RoCE bond, ret = %d.\n", ret);
+ return ret;
+ }
+ }
+
+ return hns_roce_set_bond_netdev(bond_grp, hr_dev);
+}
+
+void hns_roce_bond_suspend(struct hnae3_handle *handle)
+{
+ u8 bus_num = handle->pdev->bus->number;
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ int i;
+
+ die_info = xa_load(&roce_bond_xa, bus_num);
+ if (!die_info)
+ return;
+
+ mutex_lock(&die_info->die_mutex);
+
+ /*
+ * Avoid duplicate processing when this function is
+ * called multiple times.
+ */
+ if (die_info->suspend_cnt)
+ goto out;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ unregister_netdevice_notifier(&bond_grp->bond_nb);
+ cancel_delayed_work_sync(&bond_grp->bond_work);
+ }
+
+out:
+ die_info->suspend_cnt++;
+ mutex_unlock(&die_info->die_mutex);
+}
+
+void hns_roce_bond_resume(struct hnae3_handle *handle)
+{
+ u8 bus_num = handle->pdev->bus->number;
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ int i, ret;
+
+ die_info = xa_load(&roce_bond_xa, bus_num);
+ if (!die_info)
+ return;
+
+ mutex_lock(&die_info->die_mutex);
+
+ die_info->suspend_cnt--;
+ if (die_info->suspend_cnt)
+ goto out;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ ret = register_netdevice_notifier(&bond_grp->bond_nb);
+ if (ret)
+ dev_err(&handle->pdev->dev,
+ "failed to resume bond notifier(bus_num = %u, id = %u), ret = %d.\n",
+ bus_num, bond_grp->bond_id, ret);
+ }
+
+out:
+ mutex_unlock(&die_info->die_mutex);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
new file mode 100644
index 000000000000..98c295d78ca1
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#ifndef _HNS_ROCE_BOND_H
+#define _HNS_ROCE_BOND_H
+
+#include <linux/netdevice.h>
+#include <net/bonding.h>
+
+#define ROCE_BOND_FUNC_MAX 4
+#define ROCE_BOND_NUM_MAX 2
+
+#define BOND_ID(id) BIT(id)
+
+#define BOND_ERR_LOG(fmt, ...) \
+ pr_err("HNS RoCE Bonding: " fmt, ##__VA_ARGS__)
+
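+/* As used by hns_roce_cmd_bond(): BOND_MODE_1 is selected for the
+ * active-backup tx type; BOND_MODE_2_4 for the hash-based tx types.
+ */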
+enum {
+ BOND_MODE_1,
+ BOND_MODE_2_4,
+};
+
+enum hns_roce_bond_hashtype {
+ BOND_HASH_L2,
+ BOND_HASH_L34,
+ BOND_HASH_L23,
+};
+
+enum bond_support_type {
+ BOND_NOT_SUPPORT,
+ /*
+ * The bond_grp already exists, but it is no longer
+ * supported under the current conditions.
+ */
+ BOND_EXISTING_NOT_SUPPORT,
+ BOND_SUPPORT,
+};
+
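+/* HNS_ROCE_BOND_NOT_ATTACHED: no upper device is bound to the group yet;
+ * HNS_ROCE_BOND_NOT_BONDED: attached, but the HW bond is not configured.
+ */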
+enum hns_roce_bond_state {
+ HNS_ROCE_BOND_NOT_ATTACHED,
+ HNS_ROCE_BOND_NOT_BONDED,
+ HNS_ROCE_BOND_IS_BONDED,
+ HNS_ROCE_BOND_SLAVE_CHANGE_NUM,
+ HNS_ROCE_BOND_SLAVE_CHANGESTATE,
+};
+
+enum hns_roce_bond_cmd_type {
+ HNS_ROCE_SET_BOND,
+ HNS_ROCE_CHANGE_BOND,
+ HNS_ROCE_CLEAR_BOND,
+};
+
+struct hns_roce_func_info {
+ struct net_device *net_dev;
+ struct hnae3_handle *handle;
+};
+
+struct hns_roce_bond_group {
+ struct net_device *upper_dev;
+ struct hns_roce_dev *main_hr_dev;
+ u8 active_slave_num;
+ u32 slave_map;
+ u32 active_slave_map;
+ u8 bond_id;
+ u8 bus_num;
+ struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX];
+ bool bond_ready;
+ enum hns_roce_bond_state bond_state;
+ enum netdev_lag_tx_type tx_type;
+ enum netdev_lag_hash hash_type;
+ struct mutex bond_mutex;
+ struct notifier_block bond_nb;
+ struct delayed_work bond_work;
+};
+
+struct hns_roce_die_info {
+ u8 bond_id_mask;
+ struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+ struct mutex die_mutex;
+ u8 suspend_cnt;
+};
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+ u8 bus_num);
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev);
+void hns_roce_dealloc_bond_grp(void);
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp);
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev);
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev);
+void hns_roce_bond_suspend(struct hnae3_handle *handle);
+void hns_roce_bond_resume(struct hnae3_handle *handle);
+
+#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 06832c0ac055..318f18cf37aa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -33,6 +33,7 @@
#ifndef _HNS_ROCE_DEVICE_H
#define _HNS_ROCE_DEVICE_H
+#include <linux/pci.h>
#include <rdma/ib_verbs.h>
#include <rdma/hns-abi.h>
#include "hns_roce_debugfs.h"
@@ -153,6 +154,7 @@ enum {
HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14),
HNS_ROCE_CAP_FLAG_STASH = BIT(17),
HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19),
+ HNS_ROCE_CAP_FLAG_BOND = BIT(21),
HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22),
};
@@ -177,6 +179,7 @@ enum hns_roce_instance_state {
HNS_ROCE_STATE_INIT,
HNS_ROCE_STATE_INITED,
HNS_ROCE_STATE_UNINIT,
+ HNS_ROCE_STATE_BOND_UNINIT,
};
enum {
@@ -1167,6 +1170,17 @@ static inline u8 get_tclass(const struct ib_global_route *grh)
grh->traffic_class >> DSCP_SHIFT : grh->traffic_class;
}
+static inline struct net_device *get_hr_netdev(struct hns_roce_dev *hr_dev,
+ u8 port)
+{
+ return hr_dev->iboe.netdevs[port];
+}
+
+static inline u8 get_hr_bus_num(struct hns_roce_dev *hr_dev)
+{
+ return hr_dev->pci_dev->bus->number;
+}
+
void hns_roce_init_uar_table(struct hns_roce_dev *dev);
int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar);
@@ -1293,7 +1307,7 @@ void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn);
void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
int hns_roce_init(struct hns_roce_dev *hr_dev);
-void hns_roce_exit(struct hns_roce_dev *hr_dev);
+void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup);
int hns_roce_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq);
int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq);
int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 63052c0e7613..2d6ae89e525b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -43,11 +43,13 @@
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
+#include "hclge_main.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_cmd.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
#define CREATE_TRACE_POINTS
#include "hns_roce_trace.h"
@@ -1434,6 +1436,79 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
return ret;
}
+static enum hns_roce_opcode_type
+ get_bond_opcode(enum hns_roce_bond_cmd_type bond_type)
+{
+ switch (bond_type) {
+ case HNS_ROCE_SET_BOND:
+ return HNS_ROCE_OPC_SET_BOND_INFO;
+ case HNS_ROCE_CHANGE_BOND:
+ return HNS_ROCE_OPC_CHANGE_ACTIVE_PORT;
+ case HNS_ROCE_CLEAR_BOND:
+ return HNS_ROCE_OPC_CLEAR_BOND_INFO;
+ default:
+ WARN(true, "Invalid bond type %d!\n", bond_type);
+ return HNS_ROCE_OPC_SET_BOND_INFO;
+ }
+}
+
+static enum hns_roce_bond_hashtype
+ get_bond_hashtype(enum netdev_lag_hash netdev_hashtype)
+{
+ switch (netdev_hashtype) {
+ case NETDEV_LAG_HASH_L2:
+ return BOND_HASH_L2;
+ case NETDEV_LAG_HASH_L34:
+ return BOND_HASH_L34;
+ case NETDEV_LAG_HASH_L23:
+ return BOND_HASH_L23;
+ default:
+ WARN(true, "Invalid hash type %d!\n", netdev_hashtype);
+ return BOND_HASH_L2;
+ }
+}
+
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+ enum hns_roce_bond_cmd_type bond_type)
+{
+ enum hns_roce_opcode_type opcode = get_bond_opcode(bond_type);
+ struct hns_roce_bond_info *slave_info;
+ struct hns_roce_cmq_desc desc = {};
+ int ret;
+
+ slave_info = (struct hns_roce_bond_info *)desc.data;
+ hns_roce_cmq_setup_basic_desc(&desc, opcode, false);
+
+ slave_info->bond_id = cpu_to_le32(bond_grp->bond_id);
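+ /* A clear command carries only the bond ID */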
+ if (bond_type == HNS_ROCE_CLEAR_BOND)
+ goto out;
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ slave_info->bond_mode = cpu_to_le32(BOND_MODE_1);
+ if (bond_grp->active_slave_num != 1)
+ ibdev_warn(&bond_grp->main_hr_dev->ib_dev,
+ "active slave cnt(%u) in Mode 1 is invalid.\n",
+ bond_grp->active_slave_num);
+ } else {
+ slave_info->bond_mode = cpu_to_le32(BOND_MODE_2_4);
+ slave_info->hash_policy =
+ cpu_to_le32(get_bond_hashtype(bond_grp->hash_type));
+ }
+
+ slave_info->active_slave_cnt = cpu_to_le32(bond_grp->active_slave_num);
+ slave_info->active_slave_mask = cpu_to_le32(bond_grp->active_slave_map);
+ slave_info->slave_mask = cpu_to_le32(bond_grp->slave_map);
+
+out:
+ ret = hns_roce_cmq_send(bond_grp->main_hr_dev, &desc, 1);
+ if (ret)
+ ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+ "cmq bond type(%d) failed, ret = %d.\n",
+ bond_type, ret);
+
+ return ret;
+}
+
static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev,
dma_addr_t base_addr, u8 cmd, unsigned long tag)
{
@@ -2275,6 +2350,9 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) <<
HNS_ROCE_CAP_FLAGS_EX_SHIFT;
+ if (hr_dev->is_vf)
+ caps->flags &= ~HNS_ROCE_CAP_FLAG_BOND;
+
caps->num_cqs = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_CQS);
caps->gid_table_len[0] = hr_reg_read(resp_c, PF_CAPS_C_MAX_GID);
caps->max_cqes = 1 << hr_reg_read(resp_c, PF_CAPS_C_CQ_DEPTH);
@@ -7067,7 +7145,7 @@ error_failed_kzalloc:
}
static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
- bool reset)
+ bool reset, bool bond_cleanup)
{
struct hns_roce_dev *hr_dev = handle->priv;
@@ -7079,7 +7157,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT;
hns_roce_handle_device_err(hr_dev);
- hns_roce_exit(hr_dev);
+ hns_roce_exit(hr_dev, bond_cleanup);
kfree(hr_dev->priv);
ib_dealloc_device(&hr_dev->ib_dev);
}
@@ -7130,12 +7208,51 @@ reset_chk_err:
static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
bool reset)
{
+ /* Suspend bond to avoid concurrency */
+ hns_roce_bond_suspend(handle);
+
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
- return;
+ goto out;
handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
- __hns_roce_hw_v2_uninit_instance(handle, reset);
+ __hns_roce_hw_v2_uninit_instance(handle, reset, true);
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+
+out:
+ hns_roce_bond_resume(handle);
+}
+
+struct hns_roce_dev
+ *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx)
+{
+ struct hnae3_handle *handle;
+ int ret;
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (!handle || !handle->client)
+ return NULL;
+
+ ret = hns_roce_hw_v2_init_instance(handle);
+ if (ret)
+ return NULL;
+
+ return handle->priv;
+}
+
+void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx)
+{
+ struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle;
+
+ if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
+ return;
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT;
+
+ __hns_roce_hw_v2_uninit_instance(handle, false, false);
handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
}
@@ -7144,6 +7261,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
{
struct hns_roce_dev *hr_dev;
+ /* Suspend bond to avoid concurrency */
+ hns_roce_bond_suspend(handle);
+
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
return 0;
@@ -7174,6 +7294,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN,
&handle->rinfo.state)) {
handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+ hns_roce_bond_resume(handle);
return 0;
}
@@ -7193,6 +7314,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
dev_info(dev, "reset done, RoCE client reinit finished.\n");
}
+ hns_roce_bond_resume(handle);
return ret;
}
@@ -7204,7 +7326,7 @@ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle)
handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT;
dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n");
msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY);
- __hns_roce_hw_v2_uninit_instance(handle, false);
+ __hns_roce_hw_v2_uninit_instance(handle, false, false);
return 0;
}
@@ -7240,6 +7362,14 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle,
if (linkup || !hr_dev)
return;
+ /* For a bonded device, the link status is determined by the upper
+ * netdev, whose state depends on all of the slave netdevs rather
+ * than any single one, so a correct link status cannot be derived
+ * from this path.
+ */
+ if (hns_roce_get_bond_grp(netdev, get_hr_bus_num(hr_dev)))
+ return;
+
ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev);
}
@@ -7264,6 +7394,7 @@ static int __init hns_roce_hw_v2_init(void)
static void __exit hns_roce_hw_v2_exit(void)
{
+ hns_roce_dealloc_bond_grp();
hnae3_unregister_client(&hns_roce_hw_v2_client);
hns_roce_cleanup_debugfs();
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index e64a04d6f85b..285fe0875fac 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -35,6 +35,7 @@
#include <linux/bitops.h>
#include "hnae3.h"
+#include "hns_roce_bond.h"
#define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32
#define HNS_ROCE_V2_MTT_ENTRY_SZ 64
@@ -228,6 +229,9 @@ enum hns_roce_opcode_type {
HNS_ROCE_OPC_CFG_GMV_BT = 0x8510,
HNS_ROCE_QUERY_RAM_ECC = 0x8513,
HNS_SWITCH_PARAMETER_CFG = 0x1033,
+ HNS_ROCE_OPC_SET_BOND_INFO = 0x8601,
+ HNS_ROCE_OPC_CLEAR_BOND_INFO = 0x8602,
+ HNS_ROCE_OPC_CHANGE_ACTIVE_PORT = 0x8603,
};
#define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000
@@ -1465,7 +1469,23 @@ struct hns_roce_sccc_clr_done {
__le32 rsv[5];
};
+struct hns_roce_bond_info {
+ __le32 bond_id;
+ __le32 bond_mode;
+ __le32 active_slave_cnt;
+ __le32 active_slave_mask;
+ __le32 slave_mask;
+ __le32 hash_policy;
+};
+
+struct hns_roce_dev
+ *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx);
+void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx);
int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+ enum hns_roce_bond_cmd_type bond_type);
static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
void __iomem *dest)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index f3607fe107a7..2f4864ab7d4e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -32,7 +32,6 @@
*/
#include <linux/acpi.h>
#include <linux/module.h>
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
@@ -41,6 +40,7 @@
#include "hns_roce_device.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port,
const u8 *addr)
@@ -89,30 +89,75 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context)
return ret;
}
-static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port,
- unsigned long event)
+static int hns_roce_get_port_state(struct hns_roce_dev *hr_dev, u32 port_num,
+ enum ib_port_state *state)
{
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ struct net_device *net_dev;
+
+ net_dev = ib_device_get_netdev(&hr_dev->ib_dev, port_num);
+ if (!net_dev)
+ return -ENODEV;
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp) {
+ *state = ib_get_curr_port_state(bond_grp->upper_dev);
+ goto out;
+ }
+ }
+
+ *state = ib_get_curr_port_state(net_dev);
+out:
+ dev_put(net_dev);
+ return 0;
+}
+
+static int handle_en_event(struct net_device *netdev,
+ struct hns_roce_dev *hr_dev,
+ u32 port, unsigned long event)
+{
+ struct ib_device *ibdev = &hr_dev->ib_dev;
struct device *dev = hr_dev->dev;
- struct net_device *netdev;
+ enum ib_port_state curr_state;
+ struct ib_event ibevent;
int ret = 0;
- netdev = hr_dev->iboe.netdevs[port];
if (!netdev) {
dev_err(dev, "can't find netdev on port(%u)!\n", port);
return -ENODEV;
}
switch (event) {
- case NETDEV_UP:
- case NETDEV_CHANGE:
case NETDEV_REGISTER:
case NETDEV_CHANGEADDR:
ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
break;
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
+ if (ret)
+ return ret;
+ fallthrough;
case NETDEV_DOWN:
- /*
- * In v1 engine, only support all ports closed together.
- */
+ if (!netif_is_lag_master(netdev))
+ break;
+ curr_state = ib_get_curr_port_state(netdev);
+
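+ /* Dispatch an IB port event only when the cached port state changes */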
+ write_lock_irq(&ibdev->cache_lock);
+ if (ibdev->port_data[port].cache.last_port_state == curr_state) {
+ write_unlock_irq(&ibdev->cache_lock);
+ return 0;
+ }
+ ibdev->port_data[port].cache.last_port_state = curr_state;
+ write_unlock_irq(&ibdev->cache_lock);
+
+ ibevent.event = (curr_state == IB_PORT_DOWN) ?
+ IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE;
+ ibevent.device = ibdev;
+ ibevent.element.port_num = port + 1;
+ ib_dispatch_event(&ibevent);
break;
default:
dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event));
@@ -126,17 +171,25 @@ static int hns_roce_netdev_event(struct notifier_block *self,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct hns_roce_bond_group *bond_grp;
struct hns_roce_ib_iboe *iboe = NULL;
struct hns_roce_dev *hr_dev = NULL;
+ struct net_device *upper = NULL;
int ret;
u32 port;
hr_dev = container_of(self, struct hns_roce_dev, iboe.nb);
iboe = &hr_dev->iboe;
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0),
+ get_hr_bus_num(hr_dev));
+ upper = bond_grp ? bond_grp->upper_dev : NULL;
+ }
for (port = 0; port < hr_dev->caps.num_ports; port++) {
- if (dev == iboe->netdevs[port]) {
- ret = handle_en_event(hr_dev, port, event);
+ if ((!upper && dev == iboe->netdevs[port]) ||
+ (upper && dev == upper)) {
+ ret = handle_en_event(dev, hr_dev, port, event);
if (ret)
return NOTIFY_DONE;
break;
@@ -148,12 +201,13 @@ static int hns_roce_netdev_event(struct notifier_block *self,
static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev)
{
+ struct net_device *net_dev;
int ret;
u8 i;
for (i = 0; i < hr_dev->caps.num_ports; i++) {
- ret = hns_roce_set_mac(hr_dev, i,
- hr_dev->iboe.netdevs[i]->dev_addr);
+ net_dev = get_hr_netdev(hr_dev, i);
+ ret = hns_roce_set_mac(hr_dev, i, net_dev->dev_addr);
if (ret)
return ret;
}
@@ -221,9 +275,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
struct ib_port_attr *props)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
- struct device *dev = hr_dev->dev;
struct net_device *net_dev;
- unsigned long flags;
enum ib_mtu mtu;
u32 port;
int ret;
@@ -244,26 +296,26 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
if (ret)
ibdev_warn(ib_dev, "failed to get speed, ret = %d.\n", ret);
- spin_lock_irqsave(&hr_dev->iboe.lock, flags);
-
- net_dev = hr_dev->iboe.netdevs[port];
+ net_dev = ib_device_get_netdev(ib_dev, port_num);
if (!net_dev) {
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
- dev_err(dev, "find netdev %u failed!\n", port);
+ ibdev_err(ib_dev, "find netdev %u failed!\n", port);
return -EINVAL;
}
mtu = iboe_get_mtu(net_dev->mtu);
props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256;
- props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ?
- IB_PORT_ACTIVE :
- IB_PORT_DOWN;
+
+ dev_put(net_dev);
+
+ ret = hns_roce_get_port_state(hr_dev, port_num, &props->state);
+ if (ret) {
+ ibdev_err(ib_dev, "failed to get port state.\n");
+ return ret;
+ }
+
props->phys_state = props->state == IB_PORT_ACTIVE ?
IB_PORT_PHYS_STATE_LINK_UP :
IB_PORT_PHYS_STATE_DISABLED;
-
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
-
return 0;
}
@@ -617,9 +669,40 @@ static int hns_roce_get_hw_stats(struct ib_device *device,
return num_counters;
}
-static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
+static void
+ hns_roce_unregister_bond_cleanup(struct hns_roce_dev *hr_dev,
+ struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ int i;
+
+ /* To avoid losing the other slave devices when main_hr_dev
+ * is unregistered, re-initialize the remaining slaves before
+ * cleaning up the bond resources.
+ */
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if (net_dev && net_dev != get_hr_netdev(hr_dev, 0))
+ hns_roce_bond_init_client(bond_grp, i);
+ }
+
+ hns_roce_cleanup_bond(bond_grp);
+}
+
+static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev,
+ bool bond_cleanup)
{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
struct hns_roce_ib_iboe *iboe = &hr_dev->iboe;
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ if (bond_cleanup && hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp)
+ hns_roce_unregister_bond_cleanup(hr_dev, bond_grp);
+ }
hr_dev->active = false;
unregister_netdevice_notifier(&iboe->nb);
@@ -708,11 +791,12 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = {
static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
{
- int ret;
struct hns_roce_ib_iboe *iboe = NULL;
- struct ib_device *ib_dev = NULL;
struct device *dev = hr_dev->dev;
+ struct ib_device *ib_dev = NULL;
+ struct net_device *net_dev;
unsigned int i;
+ int ret;
iboe = &hr_dev->iboe;
spin_lock_init(&iboe->lock);
@@ -747,17 +831,38 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops);
- for (i = 0; i < hr_dev->caps.num_ports; i++) {
- if (!hr_dev->iboe.netdevs[i])
- continue;
- ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i],
- i + 1);
- if (ret)
+ dma_set_max_seg_size(dev, SZ_2G);
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ ret = hns_roce_alloc_bond_grp(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to alloc bond_grp for bus %u, ret = %d\n",
+ get_hr_bus_num(hr_dev), ret);
return ret;
+ }
+ }
+
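+ /* An active bond registers a single IB device named "hns_bond_%d";
+ * its netdev is selected from the slaves by hns_roce_bond_init().
+ */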
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND &&
+ hns_roce_bond_is_active(hr_dev)) {
+ ret = hns_roce_bond_init(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to init bond!\n");
+ return ret;
+ }
+ ret = ib_register_device(ib_dev, "hns_bond_%d", dev);
+ } else {
+ for (i = 0; i < hr_dev->caps.num_ports; i++) {
+ net_dev = get_hr_netdev(hr_dev, i);
+ if (!net_dev)
+ continue;
+
+ ret = ib_device_set_netdev(ib_dev, net_dev, i + 1);
+ if (ret)
+ return ret;
+ }
+ ret = ib_register_device(ib_dev, "hns_%d", dev);
}
- dma_set_max_seg_size(dev, SZ_2G);
- ret = ib_register_device(ib_dev, "hns_%d", dev);
if (ret) {
dev_err(dev, "ib_register_device failed!\n");
return ret;
@@ -1157,10 +1262,10 @@ error_failed_alloc_dfx_cnt:
return ret;
}
-void hns_roce_exit(struct hns_roce_dev *hr_dev)
+void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup)
{
hns_roce_unregister_debugfs(hr_dev);
- hns_roce_unregister_device(hr_dev);
+ hns_roce_unregister_device(hr_dev, bond_cleanup);
if (hr_dev->hw->hw_exit)
hr_dev->hw->hw_exit(hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index d35cf59d0f43..225c3e328e0e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include "hns_roce_device.h"
void hns_roce_init_pd_table(struct hns_roce_dev *hr_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index bdd879ac12dd..d1640c5fbaab 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -31,7 +31,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
@@ -1348,11 +1347,13 @@ static int check_mtu_validate(struct hns_roce_dev *hr_dev,
struct hns_roce_qp *hr_qp,
struct ib_qp_attr *attr, int attr_mask)
{
+ struct net_device *net_dev;
enum ib_mtu active_mtu;
int p;
p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
- active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu);
+ net_dev = get_hr_netdev(hr_dev, p);
+ active_mtu = iboe_get_mtu(net_dev->mtu);
if ((hr_dev->caps.max_mtu >= IB_MTU_2048 &&
attr->path_mtu > hr_dev->caps.max_mtu) ||
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 1090051f493b..8a6efb6b9c9e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -3,7 +3,6 @@
* Copyright (c) 2018 Hisilicon Limited.
*/
-#include <linux/pci.h>
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
#include "hns_roce_device.h"
diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c
index c6a0a661d6e7..f4f4f92ba63a 100644
--- a/drivers/infiniband/hw/irdma/cm.c
+++ b/drivers/infiniband/hw/irdma/cm.c
@@ -3710,7 +3710,7 @@ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
iwpd = iwqp->iwpd;
tagged_offset = (uintptr_t)iwqp->ietf_mem.va;
ibmr = irdma_reg_phys_mr(&iwpd->ibpd, iwqp->ietf_mem.pa, buf_len,
- IB_ACCESS_LOCAL_WRITE, &tagged_offset);
+ IB_ACCESS_LOCAL_WRITE, &tagged_offset, false);
if (IS_ERR(ibmr)) {
ret = -ENOMEM;
goto error;
diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c
index 4ef1c29032f7..ce5cf89c463c 100644
--- a/drivers/infiniband/hw/irdma/ctrl.c
+++ b/drivers/infiniband/hw/irdma/ctrl.c
@@ -2943,8 +2943,6 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch,
__le64 *wqe;
struct irdma_sc_cqp *cqp;
u64 hdr;
- struct irdma_sc_ceq *ceq;
- int ret_code = 0;
cqp = cq->dev->cqp;
if (cq->cq_uk.cq_id >= cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt)
@@ -2953,19 +2951,9 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch,
if (cq->ceq_id >= cq->dev->hmc_fpm_misc.max_ceqs)
return -EINVAL;
- ceq = cq->dev->ceq[cq->ceq_id];
- if (ceq && ceq->reg_cq)
- ret_code = irdma_sc_add_cq_ctx(ceq, cq);
-
- if (ret_code)
- return ret_code;
-
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
- if (!wqe) {
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, cq);
+ if (!wqe)
return -ENOMEM;
- }
set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
@@ -3018,17 +3006,12 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq)
struct irdma_sc_cqp *cqp;
__le64 *wqe;
u64 hdr;
- struct irdma_sc_ceq *ceq;
cqp = cq->dev->cqp;
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
if (!wqe)
return -ENOMEM;
- ceq = cq->dev->ceq[cq->ceq_id];
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, cq);
-
set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
set_64bit_val(wqe, 40, cq->shadow_area_pa);
@@ -3602,71 +3585,6 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
}
/**
- * irdma_sc_find_reg_cq - find cq ctx index
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-static u32 irdma_sc_find_reg_cq(struct irdma_sc_ceq *ceq,
- struct irdma_sc_cq *cq)
-{
- u32 i;
-
- for (i = 0; i < ceq->reg_cq_size; i++) {
- if (cq == ceq->reg_cq[i])
- return i;
- }
-
- return IRDMA_INVALID_CQ_IDX;
-}
-
-/**
- * irdma_sc_add_cq_ctx - add cq ctx tracking for ceq
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-int irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
-
- if (ceq->reg_cq_size == ceq->elem_cnt) {
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
- return -ENOMEM;
- }
-
- ceq->reg_cq[ceq->reg_cq_size++] = cq;
-
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
-
- return 0;
-}
-
-/**
- * irdma_sc_remove_cq_ctx - remove cq ctx tracking for ceq
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-void irdma_sc_remove_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq)
-{
- unsigned long flags;
- u32 cq_ctx_idx;
-
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
- cq_ctx_idx = irdma_sc_find_reg_cq(ceq, cq);
- if (cq_ctx_idx == IRDMA_INVALID_CQ_IDX)
- goto exit;
-
- ceq->reg_cq_size--;
- if (cq_ctx_idx != ceq->reg_cq_size)
- ceq->reg_cq[cq_ctx_idx] = ceq->reg_cq[ceq->reg_cq_size];
- ceq->reg_cq[ceq->reg_cq_size] = NULL;
-
-exit:
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
-}
-
-/**
* irdma_sc_cqp_init - Initialize buffers for a control Queue Pair
* @cqp: IWARP control queue pair pointer
* @info: IWARP control queue pair init info pointer
@@ -3950,11 +3868,13 @@ int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp)
*/
void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq)
{
+ unsigned long flags;
u64 temp_val;
u16 sw_cq_sel;
u8 arm_next_se;
u8 arm_seq_num;
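+ /* Serialize the read-modify-write of the CQ doorbell shadow area */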
+ spin_lock_irqsave(&ccq->dev->cqp_lock, flags);
get_64bit_val(ccq->cq_uk.shadow_area, 32, &temp_val);
sw_cq_sel = (u16)FIELD_GET(IRDMA_CQ_DBSA_SW_CQ_SELECT, temp_val);
arm_next_se = (u8)FIELD_GET(IRDMA_CQ_DBSA_ARM_NEXT_SE, temp_val);
@@ -3965,6 +3885,7 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq)
FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT_SE, arm_next_se) |
FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT, 1);
set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val);
+ spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags);
dma_wmb(); /* make sure shadow area is updated before arming */
@@ -4387,9 +4308,6 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq,
ceq->ceq_elem_pa = info->ceqe_pa;
ceq->virtual_map = info->virtual_map;
ceq->itr_no_expire = info->itr_no_expire;
- ceq->reg_cq = info->reg_cq;
- ceq->reg_cq_size = 0;
- spin_lock_init(&ceq->req_cq_lock);
ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0);
ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0);
ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL);
@@ -4472,9 +4390,6 @@ int irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq)
{
struct irdma_sc_cqp *cqp;
- if (ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, ceq->dev->ccq);
-
cqp = ceq->dev->cqp;
cqp->process_cqp_sds = irdma_update_sds_noccq;
@@ -4493,11 +4408,6 @@ int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch)
struct irdma_sc_dev *dev = ceq->dev;
dev->ccq->vsi_idx = ceq->vsi_idx;
- if (ceq->reg_cq) {
- ret_code = irdma_sc_add_cq_ctx(ceq, ceq->dev->ccq);
- if (ret_code)
- return ret_code;
- }
ret_code = irdma_sc_ceq_create(ceq, scratch, true);
if (!ret_code)
@@ -4562,7 +4472,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq)
struct irdma_sc_cq *temp_cq;
u8 polarity;
u32 cq_idx;
- unsigned long flags;
do {
cq_idx = 0;
@@ -4583,11 +4492,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq)
}
cq = temp_cq;
- if (ceq->reg_cq) {
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
- cq_idx = irdma_sc_find_reg_cq(ceq, cq);
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
- }
IRDMA_RING_MOVE_TAIL(ceq->ceq_ring);
if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring))
@@ -4731,7 +4635,8 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch,
u64 hdr;
dev = aeq->dev;
- if (dev->privileged)
+
+ if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2)
writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]);
cqp = dev->cqp;
diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c
index 7bad0e38786a..d1fc5726b979 100644
--- a/drivers/infiniband/hw/irdma/hw.c
+++ b/drivers/infiniband/hw/irdma/hw.c
@@ -2365,7 +2365,6 @@ static int irdma_cqp_manage_apbvt_cmd(struct irdma_device *iwdev,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.manage_apbvt_entry.info;
- memset(info, 0, sizeof(*info));
info->add = add_port;
info->port = accel_local_port;
cqp_info->cqp_cmd = IRDMA_OP_MANAGE_APBVT_ENTRY;
@@ -2474,7 +2473,6 @@ void irdma_manage_arp_cache(struct irdma_pci_f *rf,
if (action == IRDMA_ARP_ADD) {
cqp_info->cqp_cmd = IRDMA_OP_ADD_ARP_CACHE_ENTRY;
info = &cqp_info->in.u.add_arp_cache_entry.info;
- memset(info, 0, sizeof(*info));
info->arp_index = (u16)arp_index;
info->permanent = true;
ether_addr_copy(info->mac_addr, mac_addr);
@@ -2533,7 +2531,6 @@ int irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.manage_qhash_table_entry.info;
- memset(info, 0, sizeof(*info));
info->vsi = &iwdev->vsi;
info->manage = mtype;
info->entry_type = etype;
diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c
index 27b191f61caf..b49fd9cf2476 100644
--- a/drivers/infiniband/hw/irdma/icrdma_if.c
+++ b/drivers/infiniband/hw/irdma/icrdma_if.c
@@ -302,7 +302,8 @@ err_rt_init:
err_ctrl_init:
icrdma_deinit_interrupts(rf, cdev_info);
err_init_interrupts:
- kfree(iwdev->rf);
+ mutex_destroy(&rf->ah_tbl_lock);
+ kfree(rf);
ib_dealloc_device(&iwdev->ibdev);
return err;
@@ -319,6 +320,9 @@ static void icrdma_remove(struct auxiliary_device *aux_dev)
ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false);
irdma_ib_unregister_device(iwdev);
icrdma_deinit_interrupts(iwdev->rf, cdev_info);
+ mutex_destroy(&iwdev->rf->ah_tbl_lock);
+
+ kfree(iwdev->rf);
pr_debug("INIT: Gen[%d] func[%d] device remove success\n",
rdma_ver, PCI_FUNC(cdev_info->pdev->devfn));
diff --git a/drivers/infiniband/hw/irdma/ig3rdma_if.c b/drivers/infiniband/hw/irdma/ig3rdma_if.c
index 1bb42eb298ba..e1d6670d9396 100644
--- a/drivers/infiniband/hw/irdma/ig3rdma_if.c
+++ b/drivers/infiniband/hw/irdma/ig3rdma_if.c
@@ -55,6 +55,7 @@ static int ig3rdma_vchnl_init(struct irdma_pci_f *rf,
ret = irdma_sc_vchnl_init(&rf->sc_dev, &virt_info);
if (ret) {
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
return ret;
}
@@ -124,7 +125,9 @@ static void ig3rdma_decfg_rf(struct irdma_pci_f *rf)
{
struct irdma_hw *hw = &rf->hw;
+ mutex_destroy(&rf->ah_tbl_lock);
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
kfree(hw->io_regs);
iounmap(hw->rdma_reg.addr);
}
@@ -149,6 +152,7 @@ static int ig3rdma_cfg_rf(struct irdma_pci_f *rf,
err = ig3rdma_cfg_regions(&rf->hw, cdev_info);
if (err) {
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
return err;
}
diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h
index 886b30da188a..baab61e424a2 100644
--- a/drivers/infiniband/hw/irdma/main.h
+++ b/drivers/infiniband/hw/irdma/main.h
@@ -556,7 +556,7 @@ void irdma_copy_ip_htonl(__be32 *dst, u32 *src);
u16 irdma_get_vlan_ipv4(u32 *addr);
void irdma_get_vlan_mac_ipv6(u32 *addr, u16 *vlan_id, u8 *mac);
struct ib_mr *irdma_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size,
- int acc, u64 *iova_start);
+ int acc, u64 *iova_start, bool dma_mr);
int irdma_upload_qp_context(struct irdma_qp *iwqp, bool freeze, bool raw);
void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq);
int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd,
@@ -564,7 +564,6 @@ int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd,
void (*callback_fcn)(struct irdma_cqp_request *cqp_request),
void *cb_param);
void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request);
-bool irdma_cq_empty(struct irdma_cq *iwcq);
int irdma_inetaddr_event(struct notifier_block *notifier, unsigned long event,
void *ptr);
int irdma_inet6addr_event(struct notifier_block *notifier, unsigned long event,
diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c
index fa6325adaede..28dfad7f940c 100644
--- a/drivers/infiniband/hw/irdma/pble.c
+++ b/drivers/infiniband/hw/irdma/pble.c
@@ -506,12 +506,14 @@ exit:
void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
struct irdma_pble_alloc *palloc)
{
- pble_rsrc->freedpbles += palloc->total_cnt;
-
if (palloc->level == PBLE_LEVEL_2)
free_lvl2(pble_rsrc, palloc);
else
irdma_prm_return_pbles(&pble_rsrc->pinfo,
&palloc->level1.chunkinfo);
+
+ mutex_lock(&pble_rsrc->pble_mutex_lock);
+ pble_rsrc->freedpbles += palloc->total_cnt;
pble_rsrc->stats_alloc_freed++;
+ mutex_unlock(&pble_rsrc->pble_mutex_lock);
}
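
The reordering above is about more than style: freedpbles and stats_alloc_freed are plain read-modify-write counters shared by concurrent free paths, so unlocked "+=" updates can lose increments when two CPUs free at the same time. A hedged sketch of the race and the two usual fixes; the atomic variant assumes the fields were converted to atomic64_t, which this patch does not do:

/* CPU0 and CPU1, both in irdma_free_pble() without the lock:
 *   CPU0: tmp = freedpbles;        CPU1: tmp = freedpbles;
 *   CPU0: freedpbles = tmp + n0;   CPU1: freedpbles = tmp + n1;
 * One of the two increments is lost. Fix by serializing:
 */
mutex_lock(&pble_rsrc->pble_mutex_lock);
pble_rsrc->freedpbles += palloc->total_cnt;
pble_rsrc->stats_alloc_freed++;
mutex_unlock(&pble_rsrc->pble_mutex_lock);

/* ...or, alternatively, by making the counters atomic64_t: */
atomic64_add(palloc->total_cnt, &pble_rsrc->freedpbles);
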
diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c
index 694e5a9ed15d..cee47ddbd1b5 100644
--- a/drivers/infiniband/hw/irdma/puda.c
+++ b/drivers/infiniband/hw/irdma/puda.c
@@ -685,7 +685,6 @@ static int irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc)
ukqp->rq_size = rsrc->rq_size;
IRDMA_RING_INIT(ukqp->sq_ring, ukqp->sq_size);
- IRDMA_RING_INIT(ukqp->initial_ring, ukqp->sq_size);
IRDMA_RING_INIT(ukqp->rq_ring, ukqp->rq_size);
ukqp->wqe_alloc_db = qp->pd->dev->wqe_alloc_db;
@@ -726,7 +725,6 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq)
struct irdma_sc_cqp *cqp;
u64 hdr;
struct irdma_ccq_cqe_info compl_info;
- int status = 0;
cqp = dev->cqp;
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0);
@@ -756,16 +754,8 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq)
print_hex_dump_debug("PUDA: PUDA CREATE CQ", DUMP_PREFIX_OFFSET, 16,
8, wqe, IRDMA_CQP_WQE_SIZE * 8, false);
irdma_sc_cqp_post_sq(dev->cqp);
- status = irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ,
- &compl_info);
- if (!status) {
- struct irdma_sc_ceq *ceq = dev->ceq[0];
-
- if (ceq && ceq->reg_cq)
- status = irdma_sc_add_cq_ctx(ceq, cq);
- }
-
- return status;
+ return irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ,
+ &compl_info);
}
/**
@@ -897,23 +887,17 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type,
struct irdma_puda_buf *buf = NULL;
struct irdma_puda_buf *nextbuf = NULL;
struct irdma_virt_mem *vmem;
- struct irdma_sc_ceq *ceq;
- ceq = vsi->dev->ceq[0];
switch (type) {
case IRDMA_PUDA_RSRC_TYPE_ILQ:
rsrc = vsi->ilq;
vmem = &vsi->ilq_mem;
vsi->ilq = NULL;
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, &rsrc->cq);
break;
case IRDMA_PUDA_RSRC_TYPE_IEQ:
rsrc = vsi->ieq;
vmem = &vsi->ieq_mem;
vsi->ieq = NULL;
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, &rsrc->cq);
break;
default:
ibdev_dbg(to_ibdev(dev), "PUDA: error resource type = 0x%x\n",
diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h
index c1b8f81ea283..cab4896640a1 100644
--- a/drivers/infiniband/hw/irdma/type.h
+++ b/drivers/infiniband/hw/irdma/type.h
@@ -492,9 +492,6 @@ struct irdma_sc_ceq {
u32 first_pm_pbl_idx;
u8 polarity;
u16 vsi_idx;
- struct irdma_sc_cq **reg_cq;
- u32 reg_cq_size;
- spinlock_t req_cq_lock; /* protect access to reg_cq array */
bool virtual_map:1;
bool tph_en:1;
bool itr_no_expire:1;
@@ -894,8 +891,6 @@ struct irdma_ceq_init_info {
u8 tph_val;
u16 vsi_idx;
u32 first_pm_pbl_idx;
- struct irdma_sc_cq **reg_cq;
- u32 reg_cq_idx;
};
struct irdma_aeq_init_info {
diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c
index ce1ae10c30fc..f0846b800913 100644
--- a/drivers/infiniband/hw/irdma/uk.c
+++ b/drivers/infiniband/hw/irdma/uk.c
@@ -114,33 +114,8 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx)
*/
void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp)
{
- u64 temp;
- u32 hw_sq_tail;
- u32 sw_sq_head;
-
- /* valid bit is written and loads completed before reading shadow */
- mb();
-
- /* read the doorbell shadow area */
- get_64bit_val(qp->shadow_area, 0, &temp);
-
- hw_sq_tail = (u32)FIELD_GET(IRDMA_QP_DBSA_HW_SQ_TAIL, temp);
- sw_sq_head = IRDMA_RING_CURRENT_HEAD(qp->sq_ring);
- if (sw_sq_head != qp->initial_ring.head) {
- if (sw_sq_head != hw_sq_tail) {
- if (sw_sq_head > qp->initial_ring.head) {
- if (hw_sq_tail >= qp->initial_ring.head &&
- hw_sq_tail < sw_sq_head)
- writel(qp->qp_id, qp->wqe_alloc_db);
- } else {
- if (hw_sq_tail >= qp->initial_ring.head ||
- hw_sq_tail < sw_sq_head)
- writel(qp->qp_id, qp->wqe_alloc_db);
- }
- }
- }
-
- qp->initial_ring.head = qp->sq_ring.head;
+ dma_wmb();
+ writel(qp->qp_id, qp->wqe_alloc_db);
}
/**
@@ -194,6 +169,7 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx,
qp->sq_wrtrk_array[*wqe_idx].wrid = info->wr_id;
qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size;
qp->sq_wrtrk_array[*wqe_idx].quanta = quanta;
+ qp->sq_wrtrk_array[*wqe_idx].signaled = info->signaled;
return wqe;
}
@@ -1137,6 +1113,27 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq,
}
/**
+ * irdma_uk_cq_empty - Check if CQ is empty
+ * @cq: hw cq
+ */
+bool irdma_uk_cq_empty(struct irdma_cq_uk *cq)
+{
+ __le64 *cqe;
+ u8 polarity;
+ u64 qword3;
+
+ if (cq->avoid_mem_cflct)
+ cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(cq);
+ else
+ cqe = IRDMA_GET_CURRENT_CQ_ELEM(cq);
+
+ get_64bit_val(cqe, 24, &qword3);
+ polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
+
+ return polarity != cq->polarity;
+}
+
+/**
* irdma_uk_cq_poll_cmpl - get cq completion info
* @cq: hw cq
* @info: cq poll information returned
@@ -1287,6 +1284,8 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3);
if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) {
+ unsigned long flags;
+
srq = qp->srq_uk;
get_64bit_val(cqe, 8, &info->wr_id);
@@ -1299,8 +1298,11 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
} else {
info->stag_invalid_set = false;
}
+ spin_lock_irqsave(srq->lock, flags);
IRDMA_RING_MOVE_TAIL(srq->srq_ring);
+ spin_unlock_irqrestore(srq->lock, flags);
pring = &srq->srq_ring;
+
} else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) {
u32 array_idx;
@@ -1355,6 +1357,10 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
if (!info->comp_status)
info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len;
+ if (!qp->sq_wrtrk_array[wqe_idx].signaled) {
+ ret_code = -EFAULT;
+ goto exit;
+ }
info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3);
IRDMA_RING_SET_TAIL(qp->sq_ring,
wqe_idx + qp->sq_wrtrk_array[wqe_idx].quanta);
@@ -1420,8 +1426,9 @@ exit:
IRDMA_RING_MOVE_TAIL(cq->cq_ring);
if (!cq->avoid_mem_cflct && ext_valid)
IRDMA_RING_MOVE_TAIL(cq->cq_ring);
- set_64bit_val(cq->shadow_area, 0,
- IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
+ if (IRDMA_RING_CURRENT_HEAD(cq->cq_ring) & 0x3F || irdma_uk_cq_empty(cq))
+ set_64bit_val(cq->shadow_area, 0,
+ IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
} else {
qword3 &= ~IRDMA_CQ_WQEIDX;
qword3 |= FIELD_PREP(IRDMA_CQ_WQEIDX, pring->tail);
@@ -1574,7 +1581,6 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp,
qp->conn_wqes = move_cnt;
IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->sq_ring, move_cnt);
IRDMA_RING_MOVE_TAIL_BY_COUNT(qp->sq_ring, move_cnt);
- IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->initial_ring, move_cnt);
}
/**
@@ -1719,7 +1725,6 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info)
qp->max_sq_frag_cnt = info->max_sq_frag_cnt;
sq_ring_size = qp->sq_size << info->sq_shift;
IRDMA_RING_INIT(qp->sq_ring, sq_ring_size);
- IRDMA_RING_INIT(qp->initial_ring, sq_ring_size);
if (info->first_sq_wq) {
irdma_setup_connection_wqes(qp, info);
qp->swqe_polarity = 1;
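
The irdma_uk_qp_post_wr() rewrite above drops the doorbell-shadow tail tracking and falls back to the classic producer pattern: complete all WQE stores to DMA-coherent memory, fence, then ring the MMIO doorbell. A hedged sketch of that ordering contract, using the driver's own helpers:

/* Producer side: the device must never observe the doorbell
 * before the WQE (and its valid bit) are visible in memory.
 */
set_64bit_val(wqe, 24, hdr);	/* header with valid bit written last */
dma_wmb();			/* order WQE stores before the doorbell */
writel(qp->qp_id, qp->wqe_alloc_db);
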
diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h
index ab57f689827a..9eb7fd0b1cbf 100644
--- a/drivers/infiniband/hw/irdma/user.h
+++ b/drivers/infiniband/hw/irdma/user.h
@@ -429,6 +429,7 @@ struct irdma_wqe_uk_ops {
struct irdma_bind_window *op_info);
};
+bool irdma_uk_cq_empty(struct irdma_cq_uk *cq);
int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
struct irdma_cq_poll_info *info);
void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq,
@@ -456,7 +457,6 @@ struct irdma_srq_uk {
struct irdma_uk_attrs *uk_attrs;
__le64 *shadow_area;
struct irdma_ring srq_ring;
- struct irdma_ring initial_ring;
u32 srq_id;
u32 srq_size;
u32 max_srq_frag_cnt;
@@ -465,6 +465,7 @@ struct irdma_srq_uk {
u8 wqe_size;
u8 wqe_size_multiplier;
u8 deferred_flag;
+ spinlock_t *lock;
};
struct irdma_srq_uk_init_info {
@@ -482,7 +483,8 @@ struct irdma_sq_uk_wr_trk_info {
u64 wrid;
u32 wr_len;
u16 quanta;
- u8 reserved[2];
+ u8 signaled;
+ u8 reserved[1];
};
struct irdma_qp_quanta {
diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c
index 8b94d87b0192..cc2a12f735d3 100644
--- a/drivers/infiniband/hw/irdma/utils.c
+++ b/drivers/infiniband/hw/irdma/utils.c
@@ -452,6 +452,7 @@ struct irdma_cqp_request *irdma_alloc_and_get_cqp_request(struct irdma_cqp *cqp,
cqp_request->waiting = wait;
refcount_set(&cqp_request->refcnt, 1);
memset(&cqp_request->compl_info, 0, sizeof(cqp_request->compl_info));
+ memset(&cqp_request->info, 0, sizeof(cqp_request->info));
return cqp_request;
}
@@ -1068,7 +1069,6 @@ int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp)
cqp_info = &cqp_request->info;
qp_info = &cqp_request->info.in.u.qp_create.info;
- memset(qp_info, 0, sizeof(*qp_info));
qp_info->cq_num_valid = true;
qp_info->next_iwarp_state = IRDMA_QP_STATE_RTS;
cqp_info->cqp_cmd = IRDMA_OP_QP_CREATE;
@@ -1343,7 +1343,6 @@ int irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp)
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = IRDMA_OP_QP_DESTROY;
cqp_info->post_sq = 1;
cqp_info->in.u.qp_destroy.qp = qp;
@@ -1749,7 +1748,6 @@ int irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = IRDMA_OP_STATS_GATHER;
cqp_info->post_sq = 1;
cqp_info->in.u.stats_gather.info = pestat->gather_info;
@@ -1789,7 +1787,6 @@ int irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = cmd;
cqp_info->post_sq = 1;
cqp_info->in.u.stats_manage.info = *stats_info;
@@ -1890,7 +1887,6 @@ int irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = cmd;
cqp_info->post_sq = 1;
cqp_info->in.u.ws_node.info = *node_info;
@@ -2357,24 +2353,6 @@ void irdma_ib_qp_event(struct irdma_qp *iwqp, enum irdma_qp_event_type event)
iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context);
}
-bool irdma_cq_empty(struct irdma_cq *iwcq)
-{
- struct irdma_cq_uk *ukcq;
- u64 qword3;
- __le64 *cqe;
- u8 polarity;
-
- ukcq = &iwcq->sc_cq.cq_uk;
- if (ukcq->avoid_mem_cflct)
- cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(ukcq);
- else
- cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq);
- get_64bit_val(cqe, 24, &qword3);
- polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
-
- return polarity != ukcq->polarity;
-}
-
void irdma_remove_cmpls_list(struct irdma_cq *iwcq)
{
struct irdma_cmpl_gen *cmpl_node;
@@ -2436,6 +2414,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
struct irdma_qp_uk *qp = &iwqp->sc_qp.qp_uk;
struct irdma_ring *sq_ring = &qp->sq_ring;
struct irdma_ring *rq_ring = &qp->rq_ring;
+ struct irdma_cq *iwscq = iwqp->iwscq;
+ struct irdma_cq *iwrcq = iwqp->iwrcq;
struct irdma_cmpl_gen *cmpl;
__le64 *sw_wqe;
u64 wqe_qword;
@@ -2443,8 +2423,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
bool compl_generated = false;
unsigned long flags1;
- spin_lock_irqsave(&iwqp->iwscq->lock, flags1);
- if (irdma_cq_empty(iwqp->iwscq)) {
+ spin_lock_irqsave(&iwscq->lock, flags1);
+ if (irdma_uk_cq_empty(&iwscq->sc_cq.cq_uk)) {
unsigned long flags2;
spin_lock_irqsave(&iwqp->lock, flags2);
@@ -2452,7 +2432,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC);
if (!cmpl) {
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
return;
}
@@ -2471,24 +2451,24 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
kfree(cmpl);
continue;
}
- ibdev_dbg(iwqp->iwscq->ibcq.device,
+ ibdev_dbg(iwscq->ibcq.device,
"DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n",
__func__, cmpl->cpi.wr_id, qp->qp_id);
- list_add_tail(&cmpl->list, &iwqp->iwscq->cmpl_generated);
+ list_add_tail(&cmpl->list, &iwscq->cmpl_generated);
compl_generated = true;
}
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
if (compl_generated)
- irdma_comp_handler(iwqp->iwscq);
+ irdma_comp_handler(iwscq);
} else {
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush,
msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS));
}
- spin_lock_irqsave(&iwqp->iwrcq->lock, flags1);
- if (irdma_cq_empty(iwqp->iwrcq)) {
+ spin_lock_irqsave(&iwrcq->lock, flags1);
+ if (irdma_uk_cq_empty(&iwrcq->sc_cq.cq_uk)) {
unsigned long flags2;
spin_lock_irqsave(&iwqp->lock, flags2);
@@ -2496,7 +2476,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC);
if (!cmpl) {
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
return;
}
@@ -2508,20 +2488,20 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl->cpi.q_type = IRDMA_CQE_QTYPE_RQ;
/* remove the RQ WR by moving RQ tail */
IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1);
- ibdev_dbg(iwqp->iwrcq->ibcq.device,
+ ibdev_dbg(iwrcq->ibcq.device,
"DEV: %s: adding wr_id = 0x%llx RQ Completion to list qp_id=%d, wqe_idx=%d\n",
__func__, cmpl->cpi.wr_id, qp->qp_id,
wqe_idx);
- list_add_tail(&cmpl->list, &iwqp->iwrcq->cmpl_generated);
+ list_add_tail(&cmpl->list, &iwrcq->cmpl_generated);
compl_generated = true;
}
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
if (compl_generated)
- irdma_comp_handler(iwqp->iwrcq);
+ irdma_comp_handler(iwrcq);
} else {
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush,
msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS));
}
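
For readers new to this driver, the emptiness test that moved into irdma_uk_cq_empty() relies on valid-bit polarity rather than head/tail pointers: hardware writes each CQE with a valid bit equal to the current pass's polarity, and the consumer flips its expected polarity each time the ring wraps. A sketch of the invariant, with the field layout taken from the helper:

/* A CQE is new only if its valid bit matches the pass the
 * consumer is on; the expected polarity flips at each wrap.
 */
get_64bit_val(cqe, 24, &qword3);
polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
empty = (polarity != cq->polarity);	/* stale CQE from a prior pass */
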
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index c883c9ea5a83..6d9af41a2884 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -27,7 +27,8 @@ static int irdma_query_device(struct ib_device *ibdev,
irdma_fw_minor_ver(&rf->sc_dev);
props->device_cap_flags = IB_DEVICE_MEM_WINDOW |
IB_DEVICE_MEM_MGT_EXTENSIONS;
- props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY;
+ if (hw_attrs->uk_attrs.hw_rev < IRDMA_GEN_3)
+ props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY;
props->vendor_id = pcidev->vendor;
props->vendor_part_id = pcidev->device;
@@ -771,7 +772,6 @@ static int irdma_cqp_create_qp_cmd(struct irdma_qp *iwqp)
cqp_info = &cqp_request->info;
qp_info = &cqp_request->info.in.u.qp_create.info;
- memset(qp_info, 0, sizeof(*qp_info));
qp_info->mac_valid = true;
qp_info->cq_num_valid = true;
qp_info->next_iwarp_state = IRDMA_QP_STATE_IDLE;
@@ -2029,6 +2029,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
struct irdma_pci_f *rf;
struct irdma_cq_buf *cq_buf = NULL;
unsigned long flags;
+ u8 cqe_size;
int ret;
iwdev = to_iwdev(ibcq->device);
@@ -2045,7 +2046,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
return -EINVAL;
if (!iwcq->user_mode) {
- entries++;
+ entries += 2;
if (!iwcq->sc_cq.cq_uk.avoid_mem_cflct &&
dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2)
@@ -2053,6 +2054,10 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
if (entries & 1)
entries += 1; /* cq size must be an even number */
+
+ cqe_size = iwcq->sc_cq.cq_uk.avoid_mem_cflct ? 64 : 32;
+ if (entries * cqe_size == IRDMA_HW_PAGE_SIZE)
+ entries += 2;
}
info.cq_size = max(entries, 4);
@@ -2306,8 +2311,8 @@ static int irdma_setup_kmode_srq(struct irdma_device *iwdev,
ukinfo->srq_size = depth >> shift;
ukinfo->shadow_area = mem->va + ring_size;
- info->shadow_area_pa = info->srq_pa + ring_size;
info->srq_pa = mem->pa;
+ info->shadow_area_pa = info->srq_pa + ring_size;
return 0;
}
@@ -2384,6 +2389,7 @@ static int irdma_create_srq(struct ib_srq *ibsrq,
info.vsi = &iwdev->vsi;
info.pd = &iwpd->sc_pd;
+ iwsrq->sc_srq.srq_uk.lock = &iwsrq->lock;
err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info);
if (err_code)
goto free_dmem;
@@ -2483,6 +2489,7 @@ static int irdma_create_cq(struct ib_cq *ibcq,
int err_code;
int entries = attr->cqe;
bool cqe_64byte_ena;
+ u8 cqe_size;
err_code = cq_validate_flags(attr->flags, dev->hw_attrs.uk_attrs.hw_rev);
if (err_code)
@@ -2509,6 +2516,7 @@ static int irdma_create_cq(struct ib_cq *ibcq,
ukinfo->cq_id = cq_num;
cqe_64byte_ena = dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_64_BYTE_CQE ?
true : false;
+ cqe_size = cqe_64byte_ena ? 64 : 32;
ukinfo->avoid_mem_cflct = cqe_64byte_ena;
iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size;
if (attr->comp_vector < rf->ceqs_count)
@@ -2581,13 +2589,16 @@ static int irdma_create_cq(struct ib_cq *ibcq,
goto cq_free_rsrc;
}
- entries++;
+ entries += 2;
if (!cqe_64byte_ena && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2)
entries *= 2;
if (entries & 1)
entries += 1; /* cq size must be an even number */
+ if (entries * cqe_size == IRDMA_HW_PAGE_SIZE)
+ entries += 2;
+
ukinfo->cq_size = entries;
if (cqe_64byte_ena)
@@ -3103,12 +3114,10 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.alloc_stag.info;
- memset(info, 0, sizeof(*info));
info->page_size = PAGE_SIZE;
info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S;
info->pd_id = iwpd->sc_pd.pd_id;
info->total_len = iwmr->len;
- info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY;
info->remote_access = true;
cqp_info->cqp_cmd = IRDMA_OP_ALLOC_STAG;
cqp_info->post_sq = 1;
@@ -3119,7 +3128,7 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev,
if (status)
return status;
- iwmr->is_hwreg = 1;
+ iwmr->is_hwreg = true;
return 0;
}
@@ -3253,7 +3262,6 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
cqp_info = &cqp_request->info;
stag_info = &cqp_info->in.u.mr_reg_non_shared.info;
- memset(stag_info, 0, sizeof(*stag_info));
stag_info->va = iwpbl->user_base;
stag_info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S;
stag_info->stag_key = (u8)iwmr->stag;
@@ -3263,7 +3271,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS)
stag_info->remote_atomics_en = (access & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
stag_info->pd_id = iwpd->sc_pd.pd_id;
- stag_info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY;
+ stag_info->all_memory = iwmr->dma_mr;
if (stag_info->access_rights & IRDMA_ACCESS_FLAGS_ZERO_BASED)
stag_info->addr_type = IRDMA_ADDR_TYPE_ZERO_BASED;
else
@@ -3290,7 +3298,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request);
if (!ret)
- iwmr->is_hwreg = 1;
+ iwmr->is_hwreg = true;
return ret;
}
@@ -3647,7 +3655,6 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr)
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.dealloc_stag.info;
- memset(info, 0, sizeof(*info));
info->pd_id = iwpd->sc_pd.pd_id;
info->stag_idx = ib_mr->rkey >> IRDMA_CQPSQ_STAG_IDX_S;
info->mr = true;
@@ -3663,7 +3670,7 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr)
if (status)
return status;
- iwmr->is_hwreg = 0;
+ iwmr->is_hwreg = false;
return 0;
}
@@ -3786,9 +3793,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
* @size: size of memory to register
* @access: Access rights
* @iova_start: start of virtual address for physical buffers
+ * @dma_mr: Flag indicating whether this region is a PD DMA MR
*/
struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access,
- u64 *iova_start)
+ u64 *iova_start, bool dma_mr)
{
struct irdma_device *iwdev = to_iwdev(pd->device);
struct irdma_pbl *iwpbl;
@@ -3805,6 +3813,7 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access
iwpbl = &iwmr->iwpbl;
iwpbl->iwmr = iwmr;
iwmr->type = IRDMA_MEMREG_TYPE_MEM;
+ iwmr->dma_mr = dma_mr;
iwpbl->user_base = *iova_start;
stag = irdma_create_stag(iwdev);
if (!stag) {
@@ -3843,7 +3852,7 @@ static struct ib_mr *irdma_get_dma_mr(struct ib_pd *pd, int acc)
{
u64 kva = 0;
- return irdma_reg_phys_mr(pd, 0, 0, acc, &kva);
+ return irdma_reg_phys_mr(pd, 0, 0, acc, &kva, true);
}
/**
@@ -4078,7 +4087,7 @@ static int irdma_post_send(struct ib_qp *ibqp,
break;
case IB_WR_LOCAL_INV:
info.op_type = IRDMA_OP_TYPE_INV_STAG;
- info.local_fence = info.read_fence;
+ info.local_fence = true;
info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
err = irdma_uk_stag_local_invalidate(ukqp, &info, true);
break;
@@ -4505,7 +4514,7 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq,
}
if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
- (!irdma_cq_empty(iwcq) || !list_empty(&iwcq->cmpl_generated)))
+ (!irdma_uk_cq_empty(ukcq) || !list_empty(&iwcq->cmpl_generated)))
ret = 1;
spin_unlock_irqrestore(&iwcq->lock, flags);
@@ -5204,7 +5213,7 @@ static int irdma_create_user_ah(struct ib_ah *ibah,
struct irdma_ah *parent_ah;
int err;
- if (udata && udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN)
+ if (udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN)
return -EINVAL;
err = irdma_setup_ah(ibah, attr);
@@ -5500,7 +5509,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev)
irdma_rt_deinit_hw(iwdev);
if (!iwdev->is_vport) {
irdma_ctrl_deinit_hw(iwdev->rf);
- if (iwdev->rf->vchnl_wq)
+ if (iwdev->rf->vchnl_wq) {
destroy_workqueue(iwdev->rf->vchnl_wq);
+ mutex_destroy(&iwdev->rf->sc_dev.vchnl_mutex);
+ }
}
}
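
The two "entries += 2" changes and the new page-size check in irdma_create_cq()/irdma_resize_cq() are easiest to see with numbers: with 32-byte CQEs, a request that rounds up to 128 entries would give a ring of exactly 128 * 32 = 4096 bytes, one full hardware page, which the new check pads past. A worked sketch, assuming IRDMA_HW_PAGE_SIZE is 4096:

u32 entries = 125;			/* caller-requested CQEs */
u8 cqe_size = 32;			/* 64 when avoid_mem_cflct */

entries += 2;				/* reserved entries: 127 */
if (entries & 1)
	entries += 1;			/* even size required: 128 */
if (entries * cqe_size == IRDMA_HW_PAGE_SIZE)	/* 128 * 32 == 4096 */
	entries += 2;			/* final size: 130 */
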
diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h
index ac8b38701835..aabbb3442098 100644
--- a/drivers/infiniband/hw/irdma/verbs.h
+++ b/drivers/infiniband/hw/irdma/verbs.h
@@ -111,7 +111,8 @@ struct irdma_mr {
};
struct ib_umem *region;
int access;
- u8 is_hwreg;
+ bool is_hwreg:1;
+ bool dma_mr:1;
u16 type;
u32 page_cnt;
u64 page_size;
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index 12b481d138cf..03aacd526860 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -591,7 +591,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
int mlx4_ib_cm_init(void)
{
- cm_wq = alloc_workqueue("mlx4_ib_cm", 0, 0);
+ cm_wq = alloc_workqueue("mlx4_ib_cm", WQ_PERCPU, 0);
if (!cm_wq)
return -ENOMEM;
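
WQ_PERCPU shows up in several hunks of this series (mlx4, rdmavt, iser, isert). As I read it, these are part of a tree-wide annotation effort: the flag makes the historical per-CPU default explicit so that callers passing flags of 0 no longer rely on it implicitly. A hedged sketch of the updated call shape, with hypothetical work items:

static void my_work_fn(struct work_struct *work) { /* ... */ }
static DECLARE_WORK(my_work, my_work_fn);

wq = alloc_workqueue("example_wq", WQ_PERCPU, 0);
if (!wq)
	return -ENOMEM;
queue_work(wq, &my_work);	/* executes on the submitting CPU's pool */
destroy_workqueue(wq);
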
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 8b506417ad2f..d31d7f3005c6 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1225,6 +1225,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(create_flow_table_in, in, other_vport));
MLX5_SET(destroy_flow_table_in, din, vport_number,
MLX5_GET(create_flow_table_in, in, vport_number));
+ MLX5_SET(destroy_flow_table_in, din, other_eswitch,
+ MLX5_GET(create_flow_table_in, in, other_eswitch));
+ MLX5_SET(destroy_flow_table_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(create_flow_table_in, in,
+ eswitch_owner_vhca_id));
MLX5_SET(destroy_flow_table_in, din, table_type,
MLX5_GET(create_flow_table_in, in, table_type));
MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id);
@@ -1237,6 +1242,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(create_flow_group_in, in, other_vport));
MLX5_SET(destroy_flow_group_in, din, vport_number,
MLX5_GET(create_flow_group_in, in, vport_number));
+ MLX5_SET(destroy_flow_group_in, din, other_eswitch,
+ MLX5_GET(create_flow_group_in, in, other_eswitch));
+ MLX5_SET(destroy_flow_group_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(create_flow_group_in, in,
+ eswitch_owner_vhca_id));
MLX5_SET(destroy_flow_group_in, din, table_type,
MLX5_GET(create_flow_group_in, in, table_type));
MLX5_SET(destroy_flow_group_in, din, table_id,
@@ -1251,6 +1261,10 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(set_fte_in, in, other_vport));
MLX5_SET(delete_fte_in, din, vport_number,
MLX5_GET(set_fte_in, in, vport_number));
+ MLX5_SET(delete_fte_in, din, other_eswitch,
+ MLX5_GET(set_fte_in, in, other_eswitch));
+ MLX5_SET(delete_fte_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(set_fte_in, in, eswitch_owner_vhca_id));
MLX5_SET(delete_fte_in, din, table_type,
MLX5_GET(set_fte_in, in, table_type));
MLX5_SET(delete_fte_in, din, table_id,
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index b0f7663c24c1..d17823ce7f38 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -691,22 +691,13 @@ static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device)
return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed);
}
-static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev,
- struct mlx5_flow_namespace *ns,
+static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
struct mlx5_ib_flow_prio *prio,
- int priority,
- int num_entries, int num_groups,
- u32 flags, u16 vport)
+ struct mlx5_flow_table_attr *ft_attr)
{
- struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_table *ft;
- ft_attr.prio = priority;
- ft_attr.max_fte = num_entries;
- ft_attr.flags = flags;
- ft_attr.vport = vport;
- ft_attr.autogroup.max_num_groups = num_groups;
- ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+ ft = mlx5_create_auto_grouped_flow_table(ns, ft_attr);
if (IS_ERR(ft))
return ERR_CAST(ft);
@@ -720,6 +711,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
enum flow_table_type ft_type)
{
bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
+ struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_namespace *ns = NULL;
enum mlx5_flow_namespace_type fn_type;
struct mlx5_ib_flow_prio *prio;
@@ -797,11 +789,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
max_table_size = min_t(int, num_entries, max_table_size);
ft = prio->flow_table;
- if (!ft)
- return _get_prio(dev, ns, prio, priority, max_table_size,
- num_groups, flags, 0);
+ if (ft)
+ return prio;
- return prio;
+ ft_attr.prio = priority;
+ ft_attr.max_fte = max_table_size;
+ ft_attr.flags = flags;
+ ft_attr.autogroup.max_num_groups = num_groups;
+ return _get_prio(ns, prio, &ft_attr);
}
enum {
@@ -950,6 +945,7 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev,
enum mlx5_ib_optional_counter_type type)
{
enum mlx5_ib_optional_counter_type per_qp_type;
+ struct mlx5_flow_table_attr ft_attr = {};
enum mlx5_flow_namespace_type fn_type;
struct mlx5_flow_namespace *ns;
struct mlx5_ib_flow_prio *prio;
@@ -1003,7 +999,10 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev,
if (prio->flow_table)
return 0;
- prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = MLX5_FS_MAX_POOL_SIZE;
+ ft_attr.autogroup.max_num_groups = 1;
+ prio = _get_prio(ns, prio, &ft_attr);
if (IS_ERR(prio))
return PTR_ERR(prio);
@@ -1223,6 +1222,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
struct mlx5_ib_op_fc *opfc,
enum mlx5_ib_optional_counter_type type)
{
+ struct mlx5_flow_table_attr ft_attr = {};
enum mlx5_flow_namespace_type fn_type;
int priority, i, err, spec_num;
struct mlx5_flow_act flow_act = {};
@@ -1304,8 +1304,10 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
if (err)
goto free;
- prio = _get_prio(dev, ns, prio, priority,
- dev->num_ports * MAX_OPFC_RULES, 1, 0, 0);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = dev->num_ports * MAX_OPFC_RULES;
+ ft_attr.autogroup.max_num_groups = 1;
+ prio = _get_prio(ns, prio, &ft_attr);
if (IS_ERR(prio)) {
err = PTR_ERR(prio);
goto put_prio;
@@ -1872,7 +1874,7 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev,
u32 *flags, u16 *vport_idx,
u16 *vport,
struct mlx5_core_dev **ft_mdev,
- u32 ib_port)
+ u32 ib_port, u16 *esw_owner_vhca_id)
{
struct mlx5_core_dev *esw_mdev;
@@ -1886,8 +1888,13 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev,
return -EINVAL;
esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw);
- if (esw_mdev != dev->mdev)
- return -EOPNOTSUPP;
+ if (esw_mdev != dev->mdev) {
+ if (!MLX5_CAP_ADV_RDMA(dev->mdev,
+ rdma_transport_manager_other_eswitch))
+ return -EOPNOTSUPP;
+ *flags |= MLX5_FLOW_TABLE_OTHER_ESWITCH;
+ *esw_owner_vhca_id = MLX5_CAP_GEN(esw_mdev, vhca_id);
+ }
*flags |= MLX5_FLOW_TABLE_OTHER_VPORT;
*ft_mdev = esw_mdev;
@@ -1903,8 +1910,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
bool mcast, u32 ib_port)
{
struct mlx5_core_dev *ft_mdev = dev->mdev;
+ struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_namespace *ns = NULL;
struct mlx5_ib_flow_prio *prio = NULL;
+ u16 esw_owner_vhca_id = 0;
int max_table_size = 0;
u16 vport_idx = 0;
bool esw_encap;
@@ -1966,7 +1975,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
return ERR_PTR(-EINVAL);
ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags,
&vport_idx, &vport,
- &ft_mdev, ib_port);
+ &ft_mdev, ib_port,
+ &esw_owner_vhca_id);
if (ret)
return ERR_PTR(ret);
@@ -2026,8 +2036,13 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
if (prio->flow_table)
return prio;
- return _get_prio(dev, ns, prio, priority, max_table_size,
- MLX5_FS_MAX_TYPES, flags, vport);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = max_table_size;
+ ft_attr.flags = flags;
+ ft_attr.vport = vport;
+ ft_attr.esw_owner_vhca_id = esw_owner_vhca_id;
+ ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES;
+ return _get_prio(ns, prio, &ft_attr);
}
static struct mlx5_ib_flow_handler *
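
The _get_prio() refactor above replaces six scalar parameters with a caller-filled struct mlx5_flow_table_attr, so the new other-eswitch attributes (vport and esw_owner_vhca_id) extend one struct instead of every prototype and call site. The resulting caller shape, assembled from the hunks above:

struct mlx5_flow_table_attr ft_attr = {};

ft_attr.prio = priority;
ft_attr.max_fte = max_table_size;
ft_attr.flags = flags;
ft_attr.vport = vport;
ft_attr.esw_owner_vhca_id = esw_owner_vhca_id;	/* new in this series */
ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES;

prio = _get_prio(ns, prio, &ft_attr);
if (IS_ERR(prio))
	return PTR_ERR(prio);
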
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index cc8859d3c2f5..bbecca405171 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -44,6 +44,63 @@ static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports)
}
}
+static int mlx5_ib_set_owner_transport(struct mlx5_core_dev *cur_owner,
+ struct mlx5_core_dev *new_owner)
+{
+ int ret;
+
+ if (!MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(cur_owner, ft_support) ||
+ !MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(cur_owner, ft_support))
+ return 0;
+
+ if (!MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager) ||
+ !MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager_other_eswitch))
+ return 0;
+
+ ret = mlx5_fs_set_root_dev(cur_owner, new_owner,
+ FS_FT_RDMA_TRANSPORT_TX);
+ if (ret)
+ return ret;
+
+ ret = mlx5_fs_set_root_dev(cur_owner, new_owner,
+ FS_FT_RDMA_TRANSPORT_RX);
+ if (ret) {
+ mlx5_fs_set_root_dev(cur_owner, cur_owner,
+ FS_FT_RDMA_TRANSPORT_TX);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mlx5_ib_release_transport(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_dev *peer_dev;
+ int i, ret;
+
+ mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
+ ret = mlx5_ib_set_owner_transport(peer_dev, peer_dev);
+ WARN_ON_ONCE(ret);
+ }
+}
+
+static int mlx5_ib_take_transport(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_dev *peer_dev;
+ int ret;
+ int i;
+
+ mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
+ ret = mlx5_ib_set_owner_transport(peer_dev, dev);
+ if (ret) {
+ mlx5_ib_release_transport(dev);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static int
mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
@@ -88,10 +145,18 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
else
return mlx5_ib_set_vport_rep(lag_master, rep, vport_index);
+ if (mlx5_lag_is_shared_fdb(dev)) {
+ ret = mlx5_ib_take_transport(lag_master);
+ if (ret)
+ return ret;
+ }
+
ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
mlx5_core_net(lag_master));
- if (!ibdev)
- return -ENOMEM;
+ if (!ibdev) {
+ ret = -ENOMEM;
+ goto release_transport;
+ }
ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port),
GFP_KERNEL);
@@ -127,6 +192,10 @@ fail_add:
kfree(ibdev->port);
fail_port:
ib_dealloc_device(&ibdev->ib_dev);
+release_transport:
+ if (mlx5_lag_is_shared_fdb(lag_master))
+ mlx5_ib_release_transport(lag_master);
+
return ret;
}
@@ -182,6 +251,7 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
esw = peer_mdev->priv.eswitch;
mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
}
+ mlx5_ib_release_transport(mdev);
}
__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 90daa58126f4..40284bbb45d6 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -511,6 +511,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_XDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_1600TAUI_8_1600TBASE_CR8_KR8):
+ *active_width = IB_WIDTH_8X;
+ *active_speed = IB_SPEED_XDR;
+ break;
default:
return -EINVAL;
}
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 0e8ae85af5a6..e71ee3d52eb0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -97,33 +97,28 @@ struct mlx5_pagefault {
* a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000
-#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
-#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
-#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
-#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
-#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
-
-#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
-
static u64 mlx5_imr_ksm_entries;
+static u64 mlx5_imr_mtt_entries;
+static u64 mlx5_imr_mtt_size;
+static u8 mlx5_imr_mtt_shift;
+static u8 mlx5_imr_ksm_page_shift;
-static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries,
struct mlx5_ib_mr *imr, int flags)
{
struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
- struct mlx5_klm *end = pklm + nentries;
- int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
+ struct mlx5_ksm *end = pksm + nentries;
+ u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0;
__be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
cpu_to_be32(imr->null_mmkey.key) :
mr_to_mdev(imr)->mkeys.null_mkey;
u64 va =
- MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;
+ MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0;
if (flags & MLX5_IB_UPD_XLT_ZAP) {
- for (; pklm != end; pklm++, idx++, va += step) {
- pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- pklm->key = key;
- pklm->va = cpu_to_be64(va);
+ for (; pksm != end; pksm++, idx++, va += step) {
+ pksm->key = key;
+ pksm->va = cpu_to_be64(va);
}
return;
}
@@ -147,16 +142,15 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
*/
lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
- for (; pklm != end; pklm++, idx++, va += step) {
+ for (; pksm != end; pksm++, idx++, va += step) {
struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
- pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
if (mtt) {
- pklm->key = cpu_to_be32(mtt->ibmr.lkey);
- pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
+ pksm->key = cpu_to_be32(mtt->ibmr.lkey);
+ pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size);
} else {
- pklm->key = key;
- pklm->va = cpu_to_be64(va);
+ pksm->key = key;
+ pksm->va = cpu_to_be64(va);
}
}
}
@@ -201,7 +195,7 @@ int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
{
if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
- populate_klm(xlt, idx, nentries, mr, flags);
+ populate_ksm(xlt, idx, nentries, mr, flags);
return 0;
} else {
return populate_mtt(xlt, idx, nentries, mr, flags);
@@ -226,7 +220,7 @@ static void free_implicit_child_mr_work(struct work_struct *work)
mutex_lock(&odp_imr->umem_mutex);
mlx5r_umr_update_xlt(mr->parent,
- ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
+ ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
mutex_unlock(&odp_imr->umem_mutex);
mlx5_ib_dereg_mr(&mr->ibmr, NULL);
@@ -237,7 +231,7 @@ static void free_implicit_child_mr_work(struct work_struct *work)
static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+ unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift;
struct mlx5_ib_mr *imr = mr->parent;
/*
@@ -265,7 +259,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
- queue_work(system_unbound_wq, &mr->odp_destroy.work);
+ queue_work(system_dfl_wq, &mr->odp_destroy.work);
}
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
@@ -425,7 +419,10 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
MLX5_CAP_GEN(dev->mdev, null_mkey) &&
MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
- !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
+ !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) &&
+ mlx5_imr_ksm_entries != 0 &&
+ !(mlx5_imr_ksm_page_shift >
+ get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM)))
caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}
@@ -476,14 +473,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
int err;
odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
- idx * MLX5_IMR_MTT_SIZE,
- MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
+ idx * mlx5_imr_mtt_size,
+ mlx5_imr_mtt_size, &mlx5_mn_ops);
if (IS_ERR(odp))
return ERR_CAST(odp);
mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
MLX5_MKC_ACCESS_MODE_MTT,
- MLX5_IMR_MTT_ENTRIES);
+ mlx5_imr_mtt_entries);
if (IS_ERR(mr)) {
ib_umem_odp_release(odp);
return mr;
@@ -495,7 +492,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
mr->umem = &odp->umem;
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
- mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
+ mr->ibmr.iova = idx * mlx5_imr_mtt_size;
mr->parent = imr;
odp->private = mr;
@@ -506,7 +503,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_set(&mr->mmkey.usecount, 2);
err = mlx5r_umr_update_xlt(mr, 0,
- MLX5_IMR_MTT_ENTRIES,
+ mlx5_imr_mtt_entries,
PAGE_SHIFT,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ENABLE);
@@ -611,7 +608,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
struct mlx5_ib_mr *imr;
int err;
- if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
+ if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE))
return ERR_PTR(-EOPNOTSUPP);
umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
@@ -647,7 +644,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
err = mlx5r_umr_update_xlt(imr, 0,
mlx5_imr_ksm_entries,
- MLX5_KSM_PAGE_SHIFT,
+ mlx5_imr_ksm_page_shift,
MLX5_IB_UPD_XLT_INDIRECT |
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ENABLE);
@@ -750,20 +747,20 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
struct ib_umem_odp *odp_imr, u64 user_va,
size_t bcnt, u32 *bytes_mapped, u32 flags)
{
- unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+ unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift;
unsigned long upd_start_idx = end_idx + 1;
unsigned long upd_len = 0;
unsigned long npages = 0;
int err;
int ret;
- if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
- mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+ if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size ||
+ mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt))
return -EFAULT;
/* Fault each child mr that intersects with our interval. */
while (bcnt) {
- unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+ unsigned long idx = user_va >> mlx5_imr_mtt_shift;
struct ib_umem_odp *umem_odp;
struct mlx5_ib_mr *mtt;
u64 len;
@@ -1924,9 +1921,25 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
int mlx5_ib_odp_init(void)
{
+ u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT;
+ u8 mlx5_imr_mtt_bits;
+
+ /* 48 bits is the default arm64 VA size and also covers x86 4-level paging (47 bits) */
+ if (log_va_pages <= 48 - PAGE_SHIFT)
+ mlx5_imr_mtt_shift = 30;
+ /* 56 bits covers x86-64 with 5-level paging */
+ else if (log_va_pages <= 56 - PAGE_SHIFT)
+ mlx5_imr_mtt_shift = 34;
+ else
+ return 0;
+
+ mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift);
+ mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT;
+ mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits);
mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
- MLX5_IMR_MTT_BITS);
+ mlx5_imr_mtt_bits);
+ mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift;
return 0;
}
@@ -2093,6 +2106,6 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
destroy_prefetch_work(work);
return rc;
}
- queue_work(system_unbound_wq, &work->work);
+ queue_work(system_dfl_wq, &work->work);
return 0;
}
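
The boot-time sizing in mlx5_ib_odp_init() deserves a worked example, since these constants used to be compile-time. With 4 KiB pages (PAGE_SHIFT = 12), a 48-bit VA space gives log_va_pages <= 36, so mlx5_imr_mtt_shift stays 30 and each implicit child MR spans 1 GiB; a 56-bit VA space (x86-64 5-level) gives log_va_pages <= 44, so the shift grows to 34 and each child spans 16 GiB, keeping the parent mkey's KSM entry count bounded. Roughly:

/* 4 KiB pages, PAGE_SHIFT = 12:
 *
 *  VA bits | mtt_shift | child MR size | PTEs per child
 *  --------+-----------+---------------+----------------
 *    <= 48 |    30     |  1 GiB        | 1 << 18 = 256K
 *    <= 56 |    34     | 16 GiB        | 1 << 22 =   4M
 *
 * ksm_entries = 1 << (get_order(TASK_SIZE) - (mtt_shift - 12))
 */
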
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 88724d15705d..69af20790481 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3451,10 +3451,11 @@ int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate)
{
u32 stat_rate_support;
- if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS)
+ if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS ||
+ rate == IB_RATE_1600_GBPS)
return 0;
- if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS)
+ if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_1600_GBPS)
return -EINVAL;
stat_rate_support = MLX5_CAP_GEN(dev->mdev, stat_rate_support);
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index 0ca2743f1075..e7835ca70e2b 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -518,7 +518,8 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
*/
int rvt_driver_cq_init(void)
{
- comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+ comp_vector_wq = alloc_workqueue("%s",
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_PERCPU,
0, "rdmavt_cq");
if (!comp_vector_wq)
return -ENOMEM;
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index bcb97b3ea58a..b1df05238848 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -452,7 +452,6 @@ static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int leng
length -= bytes;
iova += bytes;
- page_offset = 0;
}
return 0;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index ac0183a2ff7a..0195d361e5e3 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -20,6 +20,54 @@
static struct rxe_recv_sockets recv_sockets;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can report false-positive circular dependencies
+ * when user-space socket API users or in-kernel users
+ * switch between a TCP and an RDMA transport.
+ * Switching between siw and rxe may also cause problems,
+ * since by default sockets are classified only by address
+ * family and not by IP protocol, and there may be
+ * different locks used between the application
+ * and the low-level sockets.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko when
+ * switching transports; use git blame to find
+ * more details.
+ */
+static struct lock_class_key rxe_recv_sk_key[2];
+static struct lock_class_key rxe_recv_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void rxe_reclassify_recv_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-RXE-RECV",
+ &rxe_recv_slock_key[0],
+ "sk_lock-AF_INET-RDMA-RXE-RECV",
+ &rxe_recv_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-RXE-RECV",
+ &rxe_recv_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-RXE-RECV",
+ &rxe_recv_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
struct net_device *ndev,
struct in_addr *saddr,
@@ -192,6 +240,7 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
err = udp_sock_create(net, &udp_cfg, &sock);
if (err < 0)
return ERR_PTR(err);
+ rxe_reclassify_recv_socket(sock);
tnl_cfg.encap_type = 1;
tnl_cfg.encap_rcv = rxe_udp_encap_recv;
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index f58e3ec6252f..ae71812bea82 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -358,7 +358,6 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
length -= bytes;
iova += bytes;
- page_offset = 0;
}
mutex_unlock(&umem_odp->umem_mutex);
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 95f1c1c2949d..845bdd03ca28 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -15,6 +15,54 @@
#include "rxe_queue.h"
#include "rxe_task.h"
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can report false-positive circular dependencies
+ * when user-space socket API users or in-kernel users
+ * switch between a TCP and an RDMA transport.
+ * Switching between siw and rxe may also cause problems,
+ * since by default sockets are classified only by address
+ * family and not by IP protocol, and there may be
+ * different locks used between the application
+ * and the low-level sockets.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko when
+ * switching transports; use git blame to find
+ * more details.
+ */
+static struct lock_class_key rxe_send_sk_key[2];
+static struct lock_class_key rxe_send_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void rxe_reclassify_send_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-RXE-SEND",
+ &rxe_send_slock_key[0],
+ "sk_lock-AF_INET-RDMA-RXE-SEND",
+ &rxe_send_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-RXE-SEND",
+ &rxe_send_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-RXE-SEND",
+ &rxe_send_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap,
int has_srq)
{
@@ -244,6 +292,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
if (err < 0)
return err;
+ rxe_reclassify_send_socket(qp->sk);
qp->sk->sk->sk_user_data = qp;
/* pick a source UDP port number for this QP based on
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
index 3661cb627d28..2a234f26ac10 100644
--- a/drivers/infiniband/sw/rxe/rxe_srq.c
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -171,7 +171,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
udata, mi, &srq->rq.producer_lock,
&srq->rq.consumer_lock);
if (err)
- goto err_free;
+ return err;
srq->rq.max_wr = attr->max_wr;
}
@@ -180,11 +180,6 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
srq->limit = attr->srq_limit;
return 0;
-
-err_free:
- rxe_queue_cleanup(q);
- srq->rq.queue = NULL;
- return err;
}
void rxe_srq_cleanup(struct rxe_pool_elem *elem)
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index eb0bd4f79a85..1d3de8209bfa 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -39,6 +39,55 @@ static void siw_cm_llp_error_report(struct sock *s);
static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
int status);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can report false-positive circular dependencies
+ * when user-space socket API users or in-kernel users
+ * switch between a TCP and an RDMA transport.
+ * Switching between siw and rxe may also cause problems,
+ * since by default sockets are classified only by address
+ * family and not by IP protocol, and there may be
+ * different locks used between the application
+ * and the low-level sockets.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko when
+ * switching transports; use git blame to find
+ * more details.
+ */
+static struct lock_class_key siw_sk_key[2];
+static struct lock_class_key siw_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void siw_reclassify_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-SIW",
+ &siw_slock_key[0],
+ "sk_lock-AF_INET-RDMA-SIW",
+ &siw_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-SIW",
+ &siw_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-SIW",
+ &siw_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static void siw_sk_assign_cm_upcalls(struct sock *sk)
{
struct siw_cep *cep = sk_to_cep(sk);
@@ -1394,6 +1443,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
if (rv < 0)
goto error;
+ siw_reclassify_socket(s);
/*
* NOTE: For simplification, connect() is called in blocking
@@ -1770,6 +1820,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
if (rv < 0)
return rv;
+ siw_reclassify_socket(s);
/*
* Allow binding local port when still in TIME_WAIT from last close.
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 2e3c0516ce8f..dc531fad73de 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -1029,7 +1029,7 @@ static int __init iser_init(void)
mutex_init(&ig.connlist_mutex);
INIT_LIST_HEAD(&ig.connlist);
- release_wq = alloc_workqueue("release workqueue", 0, 0);
+ release_wq = alloc_workqueue("release workqueue", WQ_PERCPU, 0);
if (!release_wq) {
iser_err("failed to allocate release workqueue\n");
err = -ENOMEM;
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 42977a5326ee..af811d060cc8 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -2613,7 +2613,7 @@ static struct iscsit_transport iser_target_transport = {
static int __init isert_init(void)
{
- isert_login_wq = alloc_workqueue("isert_login_wq", 0, 0);
+ isert_login_wq = alloc_workqueue("isert_login_wq", WQ_PERCPU, 0);
if (!isert_login_wq) {
isert_err("Unable to allocate isert_login_wq\n");
return -ENOMEM;
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
index ef4abdea3c2d..9ecc6343455d 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -1450,7 +1450,7 @@ err_free_chunks:
kfree(srv->chunks);
err_free_srv:
- kfree(srv);
+ put_device(&srv->dev);
return ERR_PTR(-ENOMEM);
}
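
The rtrs-srv fix is an instance of a general driver-core rule: once device_initialize() has run on an embedded struct device, the object is refcounted and must be released through put_device() and the release callback, never a bare kfree(), which would bypass the refcount and leak kobject state. A hedged sketch with hypothetical names:

struct my_srv {
	struct device dev;
	/* ... */
};

static void my_srv_release(struct device *dev)
{
	kfree(container_of(dev, struct my_srv, dev));
}

static struct my_srv *my_srv_create(void)
{
	struct my_srv *srv = kzalloc(sizeof(*srv), GFP_KERNEL);

	if (!srv)
		return NULL;
	device_initialize(&srv->dev);
	srv->dev.release = my_srv_release;

	if (setup_chunks(srv)) {	/* hypothetical setup step */
		put_device(&srv->dev);	/* NOT kfree(srv) */
		return NULL;
	}
	return srv;
}
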
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 70d29b14d851..99095645134f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -40,12 +40,13 @@ config IOMMU_IO_PGTABLE_LPAE
sizes at both stage-1 and stage-2, as well as address spaces
up to 48-bits in size.
-config IOMMU_IO_PGTABLE_LPAE_SELFTEST
- bool "LPAE selftests"
- depends on IOMMU_IO_PGTABLE_LPAE
+config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST
+ tristate "KUnit tests for LPAE"
+ depends on IOMMU_IO_PGTABLE_LPAE && KUNIT
+ default KUNIT_ALL_TESTS
help
- Enable self-tests for LPAE page table allocator. This performs
- a series of page-table consistency checks during boot.
+ Enable KUnit tests for the LPAE page table allocator. This
+ performs a series of page-table consistency checks.
If unsure, say N here.
@@ -247,7 +248,7 @@ config SUN50I_IOMMU
config TEGRA_IOMMU_SMMU
bool "NVIDIA Tegra SMMU Support"
- depends on ARCH_TEGRA
+ depends on ARCH_TEGRA || COMPILE_TEST
depends on TEGRA_AHB
depends on TEGRA_MC
select IOMMU_API
@@ -384,3 +385,5 @@ config SPRD_IOMMU
Say Y here if you want to use the multimedia devices listed above.
endif # IOMMU_SUPPORT
+
+source "drivers/iommu/generic_pt/Kconfig"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 355294fa9033..8e8843316c4b 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -3,6 +3,7 @@ obj-y += arm/ iommufd/
obj-$(CONFIG_AMD_IOMMU) += amd/
obj-$(CONFIG_INTEL_IOMMU) += intel/
obj-$(CONFIG_RISCV_IOMMU) += riscv/
+obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/
obj-$(CONFIG_IOMMU_API) += iommu.o
obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o
obj-$(CONFIG_IOMMU_API) += iommu-traces.o
@@ -12,6 +13,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o
obj-$(CONFIG_IOMMU_IOVA) += iova.o
obj-$(CONFIG_OF_IOMMU) += of_iommu.o
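
After the conversion above, the LPAE consistency checks run as ordinary KUnit cases instead of boot-time selftests, selectable as a tristate and runnable under kunit.py. A minimal hedged skeleton of such a suite, with the test body elided and names hypothetical:

#include <kunit/test.h>

static void lpae_consistency_test(struct kunit *test)
{
	/* allocate an io-pgtable, map/unmap, assert invariants ... */
	KUNIT_EXPECT_EQ(test, 0, 0);
}

static struct kunit_case lpae_test_cases[] = {
	KUNIT_CASE(lpae_consistency_test),
	{}
};

static struct kunit_suite lpae_test_suite = {
	.name = "io-pgtable-lpae",
	.test_cases = lpae_test_cases,
};
kunit_test_suite(lpae_test_suite);

MODULE_LICENSE("GPL");
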
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index ecef69c11144..f2acf471cb5d 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -11,10 +11,13 @@ config AMD_IOMMU
select MMU_NOTIFIER
select IOMMU_API
select IOMMU_IOVA
- select IOMMU_IO_PGTABLE
select IOMMU_SVA
select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_AMDV1
+ select IOMMU_PT_X86_64
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help
With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 59c04a67f398..5412a563c697 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o
+obj-y += iommu.o init.o quirks.o ppr.o pasid.o
obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 9b4b589a54b5..25044d28f28a 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag);
* the IOMMU used by this driver.
*/
void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain);
void amd_iommu_domain_flush_pages(struct protection_domain *domain,
u64 address, size_t size);
void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index a698a2e7ce2a..78b1c44bd6b5 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -18,7 +18,7 @@
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/irqreturn.h>
-#include <linux/io-pgtable.h>
+#include <linux/generic_pt/iommu.h>
/*
* Maximum number of IOMMUs supported
@@ -247,6 +247,10 @@
#define CMD_BUFFER_ENTRIES 512
#define MMIO_CMD_SIZE_SHIFT 56
#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */
+#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x))
+#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */
+#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x))
/* constants for event buffer handling */
#define EVT_BUFFER_SIZE 8192 /* 512 entries */
@@ -337,76 +341,7 @@
#define GUEST_PGTABLE_4_LEVEL 0x00
#define GUEST_PGTABLE_5_LEVEL 0x01
-#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
- ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
- (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
- IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k 0
#define PM_ADDR_MASK 0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
- (~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
- ((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
- (1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
- ((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address and a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize) \
- (((address) | ((pagesize) - 1)) & \
- (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
- (1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-/*
- * Takes a page-table level and returns the default page-size for this level
- */
-#define PTE_LEVEL_PAGE_SIZE(level) \
- (1ULL << (12 + (9 * (level))))
-
-/*
- * The IOPTE dirty bit
- */
-#define IOMMU_PTE_HD_BIT (6)
-
-/*
- * Bit value definition for I/O PTE fields
- */
-#define IOMMU_PTE_PR BIT_ULL(0)
-#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
-#define IOMMU_PTE_U BIT_ULL(59)
-#define IOMMU_PTE_FC BIT_ULL(60)
-#define IOMMU_PTE_IR BIT_ULL(61)
-#define IOMMU_PTE_IW BIT_ULL(62)
/*
* Bit value definition for DTE fields
@@ -436,12 +371,6 @@
/* DTE[128:179] | DTE[184:191] */
#define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52)
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
-#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
-#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
#define IOMMU_PROT_MASK 0x03
#define IOMMU_PROT_IR 0x01
#define IOMMU_PROT_IW 0x02
@@ -534,19 +463,6 @@ struct amd_irte_ops;
#define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0)
-#define io_pgtable_to_data(x) \
- container_of((x), struct amd_io_pgtable, pgtbl)
-
-#define io_pgtable_ops_to_data(x) \
- io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
-
-#define io_pgtable_ops_to_domain(x) \
- container_of(io_pgtable_ops_to_data(x), \
- struct protection_domain, iop)
-
-#define io_pgtable_cfg_to_data(x) \
- container_of((x), struct amd_io_pgtable, pgtbl.cfg)
-
struct gcr3_tbl_info {
u64 *gcr3_tbl; /* Guest CR3 table */
int glx; /* Number of levels for GCR3 table */
@@ -554,14 +470,6 @@ struct gcr3_tbl_info {
u16 domid; /* Per device domain ID */
};
-struct amd_io_pgtable {
- seqcount_t seqcount; /* Protects root/mode update */
- struct io_pgtable pgtbl;
- int mode;
- u64 *root;
- u64 *pgd; /* v2 pgtable pgd pointer */
-};
-
enum protection_domain_mode {
PD_MODE_NONE,
PD_MODE_V1,
@@ -589,10 +497,13 @@ struct pdom_iommu_info {
* independent of their use.
*/
struct protection_domain {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ struct pt_iommu_amdv1 amdv1;
+ struct pt_iommu_x86_64 amdv2;
+ };
struct list_head dev_list; /* List of all devices in this domain */
- struct iommu_domain domain; /* generic domain handle used by
- iommu core code */
- struct amd_io_pgtable iop;
spinlock_t lock; /* mostly used to lock the page table*/
u16 id; /* the domain id written to the device table */
enum protection_domain_mode pd_mode; /* Track page table type */
@@ -602,6 +513,9 @@ struct protection_domain {
struct mmu_notifier mn; /* mmu notifier for the SVA domain */
struct list_head dev_data_list; /* List of pdom_dev_data */
};
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain);
/*
* This structure contains information about one PCI segment in the system.
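The union introduced above works because every page-table flavour places the
generic struct iommu_domain at the same offset, which the three
PT_IOMMU_CHECK_DOMAIN() lines assert at compile time. A minimal sketch of the
invariant, assuming struct pt_iommu embeds the iommu_domain as the generic_pt
headers suggest (the static_assert form is illustrative, not the macro's
actual expansion):

    /* to_pdomain() can then use one container_of() no matter which
     * union member is the live page table: */
    static_assert(offsetof(struct protection_domain, domain) ==
                  offsetof(struct protection_domain, amdv1.iommu.domain));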
diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c
index 10fa217a7119..20b04996441d 100644
--- a/drivers/iommu/amd/debugfs.c
+++ b/drivers/iommu/amd/debugfs.c
@@ -37,7 +37,7 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf,
if (ret)
return ret;
- if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - 4) {
+ if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) {
iommu->dbg_mmio_offset = -1;
return -EINVAL;
}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index f2991c11867c..4f4d4955269e 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1710,13 +1710,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id,
list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list);
if (alloc_dev_table(pci_seg))
- return NULL;
+ goto err_free_pci_seg;
if (alloc_alias_table(pci_seg))
- return NULL;
+ goto err_free_dev_table;
if (alloc_rlookup_table(pci_seg))
- return NULL;
+ goto err_free_alias_table;
return pci_seg;
+
+err_free_alias_table:
+ free_alias_table(pci_seg);
+err_free_dev_table:
+ free_dev_table(pci_seg);
+err_free_pci_seg:
+ list_del(&pci_seg->list);
+ kfree(pci_seg);
+ return NULL;
}
static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id,
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
deleted file mode 100644
index 70c2f5b1631b..000000000000
--- a/drivers/iommu/amd/io_pgtable.c
+++ /dev/null
@@ -1,577 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table allocator.
- *
- * Copyright (C) 2020 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#define pr_fmt(fmt) "AMD-Vi: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/dma-mapping.h>
-#include <linux/seqlock.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-/*
- * Helper function to get the first pte of a large mapping
- */
-static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
- unsigned long *count)
-{
- unsigned long pte_mask, pg_size, cnt;
- u64 *fpte;
-
- pg_size = PTE_PAGE_SIZE(*pte);
- cnt = PAGE_SIZE_PTE_COUNT(pg_size);
- pte_mask = ~((cnt << 3) - 1);
- fpte = (u64 *)(((unsigned long)pte) & pte_mask);
-
- if (page_size)
- *page_size = pg_size;
-
- if (count)
- *count = cnt;
-
- return fpte;
-}
-
-static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl)
-{
- u64 *p;
- int i;
-
- for (i = 0; i < 512; ++i) {
- /* PTE present? */
- if (!IOMMU_PTE_PRESENT(pt[i]))
- continue;
-
- /* Large PTE? */
- if (PM_PTE_LEVEL(pt[i]) == 0 ||
- PM_PTE_LEVEL(pt[i]) == 7)
- continue;
-
- /*
- * Free the next level. No need to look at l1 tables here since
- * they can only contain leaf PTEs; just free them directly.
- */
- p = IOMMU_PTE_PAGE(pt[i]);
- if (lvl > 2)
- free_pt_lvl(p, freelist, lvl - 1);
- else
- iommu_pages_list_add(freelist, p);
- }
-
- iommu_pages_list_add(freelist, pt);
-}
-
-static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist)
-{
- switch (mode) {
- case PAGE_MODE_NONE:
- case PAGE_MODE_7_LEVEL:
- break;
- case PAGE_MODE_1_LEVEL:
- iommu_pages_list_add(freelist, root);
- break;
- case PAGE_MODE_2_LEVEL:
- case PAGE_MODE_3_LEVEL:
- case PAGE_MODE_4_LEVEL:
- case PAGE_MODE_5_LEVEL:
- case PAGE_MODE_6_LEVEL:
- free_pt_lvl(root, freelist, mode);
- break;
- default:
- BUG();
- }
-}
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned int page_size_level,
- gfp_t gfp)
-{
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- struct protection_domain *domain =
- container_of(pgtable, struct protection_domain, iop);
- unsigned long flags;
- bool ret = true;
- u64 *pte;
-
- pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K);
- if (!pte)
- return false;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
- pgtable->mode - 1 >= page_size_level)
- goto out;
-
- ret = false;
- if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level))
- goto out;
-
- *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));
-
- write_seqcount_begin(&pgtable->seqcount);
- pgtable->root = pte;
- pgtable->mode += 1;
- write_seqcount_end(&pgtable->seqcount);
-
- amd_iommu_update_and_flush_device_table(domain);
-
- pte = NULL;
- ret = true;
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
- iommu_free_pages(pte);
-
- return ret;
-}
-
-static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned long page_size,
- u64 **pte_page,
- gfp_t gfp,
- bool *updated)
-{
- unsigned long last_addr = address + (page_size - 1);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- unsigned int seqcount;
- int level, end_lvl;
- u64 *pte, *page;
-
- BUG_ON(!is_power_of_2(page_size));
-
- while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
- pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
- /*
- * Return an error if there is no memory to update the
- * page-table.
- */
- if (!increase_address_space(pgtable, last_addr,
- PAGE_SIZE_LEVEL(page_size), gfp))
- return NULL;
- }
-
-
- do {
- seqcount = read_seqcount_begin(&pgtable->seqcount);
-
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
- } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
-
- address = PAGE_SIZE_ALIGN(address, page_size);
- end_lvl = PAGE_SIZE_LEVEL(page_size);
-
- while (level > end_lvl) {
- u64 __pte, __npte;
- int pte_level;
-
- __pte = *pte;
- pte_level = PM_PTE_LEVEL(__pte);
-
- /*
- * If we replace a series of large PTEs, we need
- * to tear down all of them.
- */
- if (IOMMU_PTE_PRESENT(__pte) &&
- pte_level == PAGE_MODE_7_LEVEL) {
- unsigned long count, i;
- u64 *lpte;
-
- lpte = first_pte_l7(pte, NULL, &count);
-
- /*
- * Unmap the replicated PTEs that still match the
- * original large mapping
- */
- for (i = 0; i < count; ++i)
- cmpxchg64(&lpte[i], __pte, 0ULL);
-
- *updated = true;
- continue;
- }
-
- if (!IOMMU_PTE_PRESENT(__pte) ||
- pte_level == PAGE_MODE_NONE) {
- page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp,
- SZ_4K);
-
- if (!page)
- return NULL;
-
- __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
-
- /* pte could have been changed somewhere. */
- if (!try_cmpxchg64(pte, &__pte, __npte))
- iommu_free_pages(page);
- else if (IOMMU_PTE_PRESENT(__pte))
- *updated = true;
-
- continue;
- }
-
- /* No level skipping support yet */
- if (pte_level != level)
- return NULL;
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(__pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned long *page_size)
-{
- int level;
- unsigned int seqcount;
- u64 *pte;
-
- *page_size = 0;
-
- if (address > PM_LEVEL_SIZE(pgtable->mode))
- return NULL;
-
- do {
- seqcount = read_seqcount_begin(&pgtable->seqcount);
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
- } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
- *page_size = PTE_LEVEL_PAGE_SIZE(level);
-
- while (level > 0) {
-
- /* Not Present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Large PTE */
- if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
- PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
- break;
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- /* Walk to the next level */
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- *page_size = PTE_LEVEL_PAGE_SIZE(level);
- }
-
- /*
- * If we have a series of large PTEs, make
- * sure to return a pointer to the first one.
- */
- if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
- pte = first_pte_l7(pte, page_size, NULL);
-
- return pte;
-}
-
-static void free_clear_pte(u64 *pte, u64 pteval,
- struct iommu_pages_list *freelist)
-{
- u64 *pt;
- int mode;
-
- while (!try_cmpxchg64(pte, &pteval, 0))
- pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
-
- if (!IOMMU_PTE_PRESENT(pteval))
- return;
-
- pt = IOMMU_PTE_PAGE(pteval);
- mode = IOMMU_PTE_MODE(pteval);
-
- free_sub_pt(pt, mode, freelist);
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
- bool updated = false;
- u64 __pte, *pte;
- int ret, i, count;
- size_t size = pgcount << __ffs(pgsize);
- unsigned long o_iova = iova;
-
- BUG_ON(!IS_ALIGNED(iova, pgsize));
- BUG_ON(!IS_ALIGNED(paddr, pgsize));
-
- ret = -EINVAL;
- if (!(prot & IOMMU_PROT_MASK))
- goto out;
-
- while (pgcount > 0) {
- count = PAGE_SIZE_PTE_COUNT(pgsize);
- pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);
-
- ret = -ENOMEM;
- if (!pte)
- goto out;
-
- for (i = 0; i < count; ++i)
- free_clear_pte(&pte[i], pte[i], &freelist);
-
- if (!iommu_pages_list_empty(&freelist))
- updated = true;
-
- if (count > 1) {
- __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
- __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
- } else
- __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
-
- if (prot & IOMMU_PROT_IR)
- __pte |= IOMMU_PTE_IR;
- if (prot & IOMMU_PROT_IW)
- __pte |= IOMMU_PTE_IW;
-
- for (i = 0; i < count; ++i)
- pte[i] = __pte;
-
- iova += pgsize;
- paddr += pgsize;
- pgcount--;
- if (mapped)
- *mapped += pgsize;
- }
-
- ret = 0;
-
-out:
- if (updated) {
- struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- /*
- * Flush domain TLB(s) and wait for completion. Any Device-Table
- * Updates and flushing already happened in
- * increase_address_space().
- */
- amd_iommu_domain_flush_pages(dom, o_iova, size);
- spin_unlock_irqrestore(&dom->lock, flags);
- }
-
- /* Everything flushed out, free pages now */
- iommu_put_pages_list(&freelist);
-
- return ret;
-}
-
-static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long long unmapped;
- unsigned long unmap_size;
- u64 *pte;
- size_t size = pgcount << __ffs(pgsize);
-
- BUG_ON(!is_power_of_2(pgsize));
-
- unmapped = 0;
-
- while (unmapped < size) {
- pte = fetch_pte(pgtable, iova, &unmap_size);
- if (pte) {
- int i, count;
-
- count = PAGE_SIZE_PTE_COUNT(unmap_size);
- for (i = 0; i < count; i++)
- pte[i] = 0ULL;
- } else {
- return unmapped;
- }
-
- iova = (iova & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- return unmapped;
-}
-
-static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long offset_mask, pte_pgsize;
- u64 *pte, __pte;
-
- pte = fetch_pte(pgtable, iova, &pte_pgsize);
-
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- offset_mask = pte_pgsize - 1;
- __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
- return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
- unsigned long flags)
-{
- bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
- bool dirty = false;
- int i, count;
-
- /*
- * 2.2.3.2 Host Dirty Support
- * When a non-default page size is used , software must OR the
- * Dirty bits in all of the replicated host PTEs used to map
- * the page. The IOMMU does not guarantee the Dirty bits are
- * set in all of the replicated PTEs. Any portion of the page
- * may have been written even if the Dirty bit is set in only
- * one of the replicated PTEs.
- */
- count = PAGE_SIZE_PTE_COUNT(size);
- for (i = 0; i < count && test_only; i++) {
- if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
- dirty = true;
- break;
- }
- }
-
- for (i = 0; i < count && !test_only; i++) {
- if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
- (unsigned long *)&ptep[i])) {
- dirty = true;
- }
- }
-
- return dirty;
-}
-
-static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long end = iova + size - 1;
-
- do {
- unsigned long pgsize = 0;
- u64 *ptep, pte;
-
- ptep = fetch_pte(pgtable, iova, &pgsize);
- if (ptep)
- pte = READ_ONCE(*ptep);
- if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
- pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
- iova += pgsize;
- continue;
- }
-
- /*
- * Mark the whole IOVA range as dirty even if only one of
- * the replicated PTEs were marked dirty.
- */
- if (pte_test_and_clear_dirty(ptep, pgsize, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v1_free_pgtable(struct io_pgtable *iop)
-{
- struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
- struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
-
- if (pgtable->mode == PAGE_MODE_NONE)
- return;
-
- /* Page-table is not visible to IOMMU anymore, so free it */
- BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
- pgtable->mode > amd_iommu_hpt_level);
-
- free_sub_pt(pgtable->root, pgtable->mode, &freelist);
- iommu_put_pages_list(&freelist);
-}
-
-static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
-
- pgtable->root =
- iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
- if (!pgtable->root)
- return NULL;
- pgtable->mode = PAGE_MODE_3_LEVEL;
- seqcount_init(&pgtable->seqcount);
-
- cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
- cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
- cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;
-
- pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages;
- pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages;
- pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
- pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
-
- return &pgtable->pgtbl;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
- .alloc = v1_alloc_pgtable,
- .free = v1_free_pgtable,
-};
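For readers tracking what the deleted v1 allocator did, a worked example of
the replicated-PTE encoding removed above (the arithmetic follows directly
from the deleted PAGE_SIZE_* macros; SZ_32K is 0x8000):

    /* A 32 KiB mapping is not a native level size, so v1 replicated PTEs:
     *   PAGE_SIZE_LEVEL(SZ_32K)     = (__ffs(0x8000) - 12) / 9 = 0  (level 0)
     *   PAGE_SIZE_PTE_COUNT(SZ_32K) = 1 << ((15 - 12) % 9)     = 8  (8 PTEs)
     * PAGE_SIZE_PTE() sets the offset bits below bit 14 and clears bit 14,
     * so the decoder recovers the size:
     *   PTE_PAGE_SIZE(pte) = 1ULL << (1 + ffz(pte | 0xfff)) = 1 << 15
     * Per the "2.2.3.2 Host Dirty Support" quote above, dirty state must be
     * ORed across all eight replicated PTEs when it is read back.
     */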
diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c
deleted file mode 100644
index b47941353ccb..000000000000
--- a/drivers/iommu/amd/io_pgtable_v2.c
+++ /dev/null
@@ -1,370 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table v2 allocator.
- *
- * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- * Author: Vasant Hegde <vasant.hegde@amd.com>
- */
-
-#define pr_fmt(fmt) "AMD-Vi: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */
-#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */
-#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */
-#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */
-#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */
-#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */
-#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */
-#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */
-#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */
-
-#define MAX_PTRS_PER_PAGE 512
-
-#define IOMMU_PAGE_SIZE_2M BIT_ULL(21)
-#define IOMMU_PAGE_SIZE_1G BIT_ULL(30)
-
-
-static inline int get_pgtable_level(void)
-{
- return amd_iommu_gpt_level;
-}
-
-static inline bool is_large_pte(u64 pte)
-{
- return (pte & IOMMU_PAGE_PSE);
-}
-
-static inline u64 set_pgtable_attr(u64 *page)
-{
- u64 prot;
-
- prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER;
- prot |= IOMMU_PAGE_ACCESS;
-
- return (iommu_virt_to_phys(page) | prot);
-}
-
-static inline void *get_pgtable_pte(u64 pte)
-{
- return iommu_phys_to_virt(pte & PM_ADDR_MASK);
-}
-
-static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot)
-{
- u64 pte;
-
- pte = __sme_set(paddr & PM_ADDR_MASK);
- pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER;
- pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
-
- if (prot & IOMMU_PROT_IW)
- pte |= IOMMU_PAGE_RW;
-
- /* Large page */
- if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M)
- pte |= IOMMU_PAGE_PSE;
-
- return pte;
-}
-
-static inline u64 get_alloc_page_size(u64 size)
-{
- if (size >= IOMMU_PAGE_SIZE_1G)
- return IOMMU_PAGE_SIZE_1G;
-
- if (size >= IOMMU_PAGE_SIZE_2M)
- return IOMMU_PAGE_SIZE_2M;
-
- return PAGE_SIZE;
-}
-
-static inline int page_size_to_level(u64 pg_size)
-{
- if (pg_size == IOMMU_PAGE_SIZE_1G)
- return PAGE_MODE_3_LEVEL;
- if (pg_size == IOMMU_PAGE_SIZE_2M)
- return PAGE_MODE_2_LEVEL;
-
- return PAGE_MODE_1_LEVEL;
-}
-
-static void free_pgtable(u64 *pt, int level)
-{
- u64 *p;
- int i;
-
- for (i = 0; i < MAX_PTRS_PER_PAGE; i++) {
- /* PTE present? */
- if (!IOMMU_PTE_PRESENT(pt[i]))
- continue;
-
- if (is_large_pte(pt[i]))
- continue;
-
- /*
- * Free the next level. No need to look at l1 tables here since
- * they can only contain leaf PTEs; just free them directly.
- */
- p = get_pgtable_pte(pt[i]);
- if (level > 2)
- free_pgtable(p, level - 1);
- else
- iommu_free_pages(p);
- }
-
- iommu_free_pages(pt);
-}
-
-/* Allocate page table */
-static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova,
- unsigned long pg_size, gfp_t gfp, bool *updated)
-{
- u64 *pte, *page;
- int level, end_level;
-
- level = get_pgtable_level() - 1;
- end_level = page_size_to_level(pg_size);
- pte = &pgd[PM_LEVEL_INDEX(level, iova)];
- iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE);
-
- while (level >= end_level) {
- u64 __pte, __npte;
-
- __pte = *pte;
-
- if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) {
- /* Unmap large pte */
- cmpxchg64(pte, *pte, 0ULL);
- *updated = true;
- continue;
- }
-
- if (!IOMMU_PTE_PRESENT(__pte)) {
- page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K);
- if (!page)
- return NULL;
-
- __npte = set_pgtable_attr(page);
- /* pte could have been changed somewhere. */
- if (!try_cmpxchg64(pte, &__pte, __npte))
- iommu_free_pages(page);
- else if (IOMMU_PTE_PRESENT(__pte))
- *updated = true;
-
- continue;
- }
-
- level -= 1;
- pte = get_pgtable_pte(__pte);
- pte = &pte[PM_LEVEL_INDEX(level, iova)];
- }
-
- /* Tear down existing pte entries */
- if (IOMMU_PTE_PRESENT(*pte)) {
- u64 *__pte;
-
- *updated = true;
- __pte = get_pgtable_pte(*pte);
- cmpxchg64(pte, *pte, 0ULL);
- if (pg_size == IOMMU_PAGE_SIZE_1G)
- free_pgtable(__pte, end_level - 1);
- else if (pg_size == IOMMU_PAGE_SIZE_2M)
- iommu_free_pages(__pte);
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address.
- * If there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
- unsigned long iova, unsigned long *page_size)
-{
- u64 *pte;
- int level;
-
- level = get_pgtable_level() - 1;
- pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)];
- /* Default page size is 4K */
- *page_size = PAGE_SIZE;
-
- while (level) {
- /* Not present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Walk to the next level */
- pte = get_pgtable_pte(*pte);
- pte = &pte[PM_LEVEL_INDEX(level - 1, iova)];
-
- /* Large page */
- if (is_large_pte(*pte)) {
- if (level == PAGE_MODE_3_LEVEL)
- *page_size = IOMMU_PAGE_SIZE_1G;
- else if (level == PAGE_MODE_2_LEVEL)
- *page_size = IOMMU_PAGE_SIZE_2M;
- else
- return NULL; /* Wrongly set PSE bit in PTE */
-
- break;
- }
-
- level -= 1;
- }
-
- return pte;
-}
-
-static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- u64 *pte;
- unsigned long map_size;
- unsigned long mapped_size = 0;
- unsigned long o_iova = iova;
- size_t size = pgcount << __ffs(pgsize);
- int ret = 0;
- bool updated = false;
-
- if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount)
- return -EINVAL;
-
- if (!(prot & IOMMU_PROT_MASK))
- return -EINVAL;
-
- while (mapped_size < size) {
- map_size = get_alloc_page_size(pgsize);
- pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd,
- iova, map_size, gfp, &updated);
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
-
- *pte = set_pte_attr(paddr, map_size, prot);
-
- iova += map_size;
- paddr += map_size;
- mapped_size += map_size;
- }
-
-out:
- if (updated) {
- struct protection_domain *pdom = io_pgtable_ops_to_domain(ops);
- unsigned long flags;
-
- spin_lock_irqsave(&pdom->lock, flags);
- amd_iommu_domain_flush_pages(pdom, o_iova, size);
- spin_unlock_irqrestore(&pdom->lock, flags);
- }
-
- if (mapped)
- *mapped += mapped_size;
-
- return ret;
-}
-
-static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- unsigned long unmap_size;
- unsigned long unmapped = 0;
- size_t size = pgcount << __ffs(pgsize);
- u64 *pte;
-
- if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount))
- return 0;
-
- while (unmapped < size) {
- pte = fetch_pte(pgtable, iova, &unmap_size);
- if (!pte)
- return unmapped;
-
- *pte = 0ULL;
-
- iova = (iova & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- return unmapped;
-}
-
-static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long offset_mask, pte_pgsize;
- u64 *pte, __pte;
-
- pte = fetch_pte(pgtable, iova, &pte_pgsize);
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- offset_mask = pte_pgsize - 1;
- __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
- return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v2_free_pgtable(struct io_pgtable *iop)
-{
- struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
-
- if (!pgtable || !pgtable->pgd)
- return;
-
- /* Free page table */
- free_pgtable(pgtable->pgd, get_pgtable_level());
- pgtable->pgd = NULL;
-}
-
-static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
- int ias = IOMMU_IN_ADDR_BIT_SIZE;
-
- pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
- if (!pgtable->pgd)
- return NULL;
-
- if (get_pgtable_level() == PAGE_MODE_5_LEVEL)
- ias = 57;
-
- pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages;
- pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages;
- pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys;
-
- cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
- cfg->ias = ias;
- cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;
-
- return &pgtable->pgtbl;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = {
- .alloc = v2_alloc_pgtable,
- .free = v2_free_pgtable,
-};
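A short worked note on the leaf sizes the deleted v2 walker handled (values
taken from the removed page_size_to_level() and get_alloc_page_size()
helpers; the layout mirrors x86 long-mode paging):

    /* Leaf placement in the removed v2 walker:
     *   1 GiB leaf: indexed by IOVA bits [38:30] (PAGE_MODE_3_LEVEL)
     *   2 MiB leaf: indexed by IOVA bits [29:21] (PAGE_MODE_2_LEVEL)
     *   4 KiB leaf: indexed by IOVA bits [20:12] (PAGE_MODE_1_LEVEL)
     * Non-4K leaves carry IOMMU_PAGE_PSE, mirroring the CPU's PSE bit.
     */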
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2e1865daa1ce..9f1d56a5e145 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -30,7 +30,6 @@
#include <linux/msi.h>
#include <linux/irqdomain.h>
#include <linux/percpu.h>
-#include <linux/io-pgtable.h>
#include <linux/cc_platform.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
@@ -41,9 +40,9 @@
#include <asm/gart.h>
#include <asm/dma.h>
#include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
#include "amd_iommu.h"
-#include "../dma-iommu.h"
#include "../irq_remapping.h"
#include "../iommu-pages.h"
@@ -60,7 +59,6 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
-static const struct iommu_dirty_ops amd_dirty_ops;
int amd_iommu_max_glx_val = -1;
@@ -70,15 +68,22 @@ int amd_iommu_max_glx_val = -1;
*/
DEFINE_IDA(pdom_ids);
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev);
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+ struct iommu_domain *old);
static void set_dte_entry(struct amd_iommu *iommu,
- struct iommu_dev_data *dev_data);
+ struct iommu_dev_data *dev_data,
+ phys_addr_t top_paddr, unsigned int top_level);
+
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level);
static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid);
static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid);
+static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain);
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
/****************************************************************************
*
@@ -1157,6 +1162,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
*
****************************************************************************/
+static void dump_command_buffer(struct amd_iommu *iommu)
+{
+ struct iommu_cmd *cmd;
+ u32 head, tail;
+ int i;
+
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head),
+ MMIO_CMD_BUFFER_TAIL(tail));
+
+ for (i = 0; i < CMD_BUFFER_ENTRIES; i++) {
+ cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd));
+ pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2],
+ cmd->data[3]);
+ }
+}
+
static int wait_on_sem(struct amd_iommu *iommu, u64 data)
{
int i = 0;
@@ -1167,7 +1191,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data)
}
if (i == LOOP_TIMEOUT) {
- pr_alert("Completion-Wait loop timed out\n");
+ pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n",
+ iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid),
+ PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid));
+
+ if (amd_iommu_dump)
+ DO_ONCE_LITE(dump_command_buffer, iommu);
+
return -EIO;
}
@@ -1756,42 +1787,6 @@ static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}
-/* Flush the not present cache if it exists */
-static void domain_flush_np_cache(struct protection_domain *domain,
- dma_addr_t iova, size_t size)
-{
- if (unlikely(amd_iommu_np_cache)) {
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
- amd_iommu_domain_flush_pages(domain, iova, size);
- spin_unlock_irqrestore(&domain->lock, flags);
- }
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- lockdep_assert_held(&domain->lock);
-
- list_for_each_entry(dev_data, &domain->dev_list, list) {
- struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
-
- set_dte_entry(iommu, dev_data);
- clone_aliases(iommu, dev_data->dev);
- }
-
- list_for_each_entry(dev_data, &domain->dev_list, list)
- device_flush_dte(dev_data);
-
- domain_flush_complete(domain);
-}
-
int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
{
struct iommu_dev_data *dev_data;
@@ -2051,7 +2046,8 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
}
static void set_dte_entry(struct amd_iommu *iommu,
- struct iommu_dev_data *dev_data)
+ struct iommu_dev_data *dev_data,
+ phys_addr_t top_paddr, unsigned int top_level)
{
u16 domid;
u32 old_domid;
@@ -2060,19 +2056,36 @@ static void set_dte_entry(struct amd_iommu *iommu,
struct protection_domain *domain = dev_data->domain;
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
+ struct pt_iommu_amdv1_hw_info pt_info;
+
+ make_clear_dte(dev_data, dte, &new);
if (gcr3_info && gcr3_info->gcr3_tbl)
domid = dev_data->gcr3_info.domid;
- else
+ else {
domid = domain->id;
- make_clear_dte(dev_data, dte, &new);
-
- if (domain->iop.mode != PAGE_MODE_NONE)
- new.data[0] |= iommu_virt_to_phys(domain->iop.root);
+ if (domain->domain.type & __IOMMU_DOMAIN_PAGING) {
+ /*
+ * When updating the IO pagetable, the new top and level
+ * are provided as parameters. For other operations, e.g.
+ * device attach, retrieve the current pagetable info
+ * via the IOMMU PT API.
+ */
+ if (top_paddr) {
+ pt_info.host_pt_root = top_paddr;
+ pt_info.mode = top_level + 1;
+ } else {
+ WARN_ON(top_paddr || top_level);
+ pt_iommu_amdv1_hw_info(&domain->amdv1,
+ &pt_info);
+ }
- new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
+ new.data[0] |= __sme_set(pt_info.host_pt_root) |
+ (pt_info.mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ }
+ }
new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW;
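For orientation, the host page-table root and mode land in DTE word 0 the
same way the old code encoded them (a worked example; field positions per
the existing DEV_ENTRY_MODE_* definitions):

    /* A 3-level host table rooted at 0x1234567000:
     *   new.data[0] |= __sme_set(0x1234567000)   (root, bits [51:12])
     *               |  (3 & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT
     * i.e. mode bits [11:9] = 3, exactly what pt_info.mode carries.
     */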
@@ -2138,7 +2151,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
if (set)
- set_dte_entry(iommu, dev_data);
+ set_dte_entry(iommu, dev_data, 0, 0);
else
clear_dte_entry(iommu, dev_data);
@@ -2156,6 +2169,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
{
struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
int max_pasids = dev_data->max_pasids;
+ struct pt_iommu_x86_64_hw_info pt_info;
int ret = 0;
/*
@@ -2178,7 +2192,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
if (!pdom_is_v2_pgtbl_mode(pdom))
return ret;
- ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
+ pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info);
+ ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true);
if (ret)
free_gcr3_table(&dev_data->gcr3_info);
@@ -2500,94 +2515,240 @@ struct protection_domain *protection_domain_alloc(void)
return domain;
}
-static int pdom_setup_pgtable(struct protection_domain *domain,
- struct device *dev)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
+{
+ if (amd_iommu_hatdis)
+ return false;
+
+ return iommu && (iommu->features & FEATURE_HDSUP);
+}
+
+static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt)
{
- struct io_pgtable_ops *pgtbl_ops;
- enum io_pgtable_fmt fmt;
+ struct protection_domain *pdom =
+ container_of(iommupt, struct protection_domain, iommu);
- switch (domain->pd_mode) {
- case PD_MODE_V1:
- fmt = AMD_IOMMU_V1;
- break;
- case PD_MODE_V2:
- fmt = AMD_IOMMU_V2;
- break;
- case PD_MODE_NONE:
- WARN_ON_ONCE(1);
- return -EPERM;
+ return &pdom->lock;
+}
+
+/*
+ * Update all HW references to the domain with a new pgtable configuration.
+ */
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level)
+{
+ struct protection_domain *pdom =
+ container_of(iommu_table, struct protection_domain, iommu);
+ struct iommu_dev_data *dev_data;
+
+ lockdep_assert_held(&pdom->lock);
+
+ /* Update the DTE for all devices attached to this domain */
+ list_for_each_entry(dev_data, &pdom->dev_list, list) {
+ struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
+
+ /* Update the HW references with the new level and top ptr */
+ set_dte_entry(iommu, dev_data, top_paddr, top_level);
+ clone_aliases(iommu, dev_data->dev);
}
- domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev);
- pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain);
- if (!pgtbl_ops)
- return -ENOMEM;
+ list_for_each_entry(dev_data, &pdom->dev_list, list)
+ device_flush_dte(dev_data);
+
+ domain_flush_complete(pdom);
+}
+
+/*
+ * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to
+ * present (i.e. mapping) operations. It is a NOP if the IOMMU doesn't
+ * have non-present caching (like hypervisor shadowing).
+ */
+static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
+ unsigned long iova, size_t size)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ if (likely(!amd_iommu_np_cache))
+ return 0;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ amd_iommu_domain_flush_pages(domain, iova, size);
+ spin_unlock_irqrestore(&domain->lock, flags);
return 0;
}
-static inline u64 dma_max_address(enum protection_domain_mode pgtable)
+static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
{
- if (pgtable == PD_MODE_V1)
- return PM_LEVEL_SIZE(amd_iommu_hpt_level);
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
- /*
- * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page
- * Translation" shows that the V2 table sign extends the top of the
- * address space creating a reserved region in the middle of the
- * translation, just like the CPU does. Further Vasant says the docs are
- * incomplete and this only applies to non-zero PASIDs. If the AMDv2
- * page table is assigned to the 0 PASID then there is no sign extension
- * check.
- *
- * Since the IOMMU must have a fixed geometry, and the core code does
- * not understand sign extended addressing, we have to chop off the high
- * bit to get consistent behavior with attachments of the domain to any
- * PASID.
- */
- return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1);
+ spin_lock_irqsave(&dom->lock, flags);
+ amd_iommu_domain_flush_all(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
}
-static bool amd_iommu_hd_support(struct amd_iommu *iommu)
+static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- if (amd_iommu_hatdis)
- return false;
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
- return iommu && (iommu->features & FEATURE_HDSUP);
+ spin_lock_irqsave(&dom->lock, flags);
+ amd_iommu_domain_flush_pages(dom, gather->start,
+ gather->end - gather->start + 1);
+ spin_unlock_irqrestore(&dom->lock, flags);
+ iommu_put_pages_list(&gather->freelist);
}
-static struct iommu_domain *
-do_iommu_domain_alloc(struct device *dev, u32 flags,
- enum protection_domain_mode pgtable)
+static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = {
+ .get_top_lock = amd_iommu_get_top_lock,
+ .change_top = amd_iommu_change_top,
+};
+
+static const struct iommu_domain_ops amdv1_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1),
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+ .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
+
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1),
+ .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+};
+
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev,
+ u32 flags)
{
- bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
+ struct pt_iommu_amdv1_cfg cfg = {};
struct protection_domain *domain;
int ret;
+ if (amd_iommu_hatdis)
+ return ERR_PTR(-EOPNOTSUPP);
+
domain = protection_domain_alloc();
if (!domain)
return ERR_PTR(-ENOMEM);
- domain->pd_mode = pgtable;
- ret = pdom_setup_pgtable(domain, dev);
+ domain->pd_mode = PD_MODE_V1;
+ domain->iommu.driver_ops = &amd_hw_driver_ops_v1;
+ domain->iommu.nid = dev_to_node(dev);
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ domain->domain.dirty_ops = &amdv1_dirty_ops;
+
+ /*
+ * Someday FORCE_COHERENCE should be set by
+ * amd_iommu_enforce_cache_coherency() like VT-d does.
+ */
+ cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+
+ /*
+ * AMD's IOMMU can flush as many pages as necessary in a single flush.
+ * Unless we run in a virtual machine, which can be inferred according
+ * to whether "non-present cache" is on, it is probably best to prefer
+ * (potentially) too extensive TLB flushing (i.e., more misses) over
+ * multiple TLB flushes (i.e., more flushes). For virtual machines the
+ * hypervisor needs to synchronize the host IOMMU PTEs with those of
+ * the guest, and the trade-off is different: unnecessary TLB flushes
+ * should be avoided.
+ */
+ if (amd_iommu_np_cache)
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+ else
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
+
+ cfg.common.hw_max_vasz_lg2 =
+ min(64, (amd_iommu_hpt_level - 1) * 9 + 21);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.starting_level = 2;
+ domain->domain.ops = &amdv1_ops;
+
+ ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL);
if (ret) {
- pdom_id_free(domain->id);
- kfree(domain);
+ amd_iommu_domain_free(&domain->domain);
return ERR_PTR(ret);
}
- domain->domain.geometry.aperture_start = 0;
- domain->domain.geometry.aperture_end = dma_max_address(pgtable);
- domain->domain.geometry.force_aperture = true;
- domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap;
+ /*
+ * Narrow the supported page sizes to those selected by the kernel
+ * command line.
+ */
+ domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap;
+ return &domain->domain;
+}
- domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
- domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+static const struct iommu_domain_ops amdv2_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+ /*
+ * Note the AMDv2 page table format does not support a Force Coherency
+ * bit, so enforce_cache_coherency should not be set. However VFIO is
+ * not prepared to handle a case where some domains will support
+ * enforcement and others do not. VFIO and iommufd will have to be fixed
+ * before it can fully use the V2 page table. See the comment in
+ * iommufd_hwpt_paging_alloc(). For now leave things as they have
+ * historically been and lie about enforce_cache_coherency.
+ */
+ .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
- if (dirty_tracking)
- domain->domain.dirty_ops = &amd_dirty_ops;
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev,
+ u32 flags)
+{
+ struct pt_iommu_x86_64_cfg cfg = {};
+ struct protection_domain *domain;
+ int ret;
+ if (!amd_iommu_v2_pgtbl_supported())
+ return ERR_PTR(-EOPNOTSUPP);
+
+ domain = protection_domain_alloc();
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ domain->pd_mode = PD_MODE_V2;
+ domain->iommu.nid = dev_to_node(dev);
+
+ cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES);
+ if (amd_iommu_np_cache)
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+ else
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
+
+ /*
+ * The v2 table behaves differently if it is attached to PASID 0 vs a
+ * non-zero PASID. On PASID 0 it has no sign extension and the full
+ * 57/48 bits decode the lower addresses. Otherwise it behaves like a
+ * normal sign-extended x86 page table. Since we want the domain to
+ * work in both modes, the top bit is removed and PT_FEAT_SIGN_EXTEND
+ * is not set, which creates a table that is compatible either way.
+ */
+ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
+ cfg.common.hw_max_vasz_lg2 = 56;
+ cfg.top_level = 4;
+ } else {
+ cfg.common.hw_max_vasz_lg2 = 47;
+ cfg.top_level = 3;
+ }
+ cfg.common.hw_max_oasz_lg2 = 52;
+ domain->domain.ops = &amdv2_ops;
+
+ ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL);
+ if (ret) {
+ amd_iommu_domain_free(&domain->domain);
+ return ERR_PTR(ret);
+ }
return &domain->domain;
}
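To make the hw_max_vasz_lg2 values above concrete (arithmetic only, derived
from this hunk and the dma_max_address() comment removed earlier):

    /* 5-level guest table: 57 usable bits minus the top bit reserved for
     * sign extension on non-zero PASIDs -> hw_max_vasz_lg2 = 56, i.e. an
     * aperture of [0, 1ULL << 56). 4-level: 48 - 1 = 47 -> [0, 1ULL << 47).
     * This reproduces the old behaviour of chopping the high bit so one
     * domain is usable on PASID 0 and non-zero PASIDs alike.
     */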
@@ -2608,15 +2769,27 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
/* Allocate domain with v1 page table for dirty tracking */
if (!amd_iommu_hd_support(iommu))
break;
- return do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
+ return amd_iommu_domain_alloc_paging_v1(dev, flags);
case IOMMU_HWPT_ALLOC_PASID:
/* Allocate domain with v2 page table if IOMMU supports PASID. */
if (!amd_iommu_pasid_supported())
break;
- return do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
- case 0:
+ return amd_iommu_domain_alloc_paging_v2(dev, flags);
+ case 0: {
+ struct iommu_domain *ret;
+
/* If nothing specific is required use the kernel commandline default */
- return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
+ if (amd_iommu_pgtable == PD_MODE_V1) {
+ ret = amd_iommu_domain_alloc_paging_v1(dev, flags);
+ if (ret != ERR_PTR(-EOPNOTSUPP))
+ return ret;
+ return amd_iommu_domain_alloc_paging_v2(dev, flags);
+ }
+ ret = amd_iommu_domain_alloc_paging_v2(dev, flags);
+ if (ret != ERR_PTR(-EOPNOTSUPP))
+ return ret;
+ return amd_iommu_domain_alloc_paging_v1(dev, flags);
+ }
default:
break;
}
@@ -2628,14 +2801,14 @@ void amd_iommu_domain_free(struct iommu_domain *dom)
struct protection_domain *domain = to_pdomain(dom);
WARN_ON(!list_empty(&domain->dev_list));
- if (domain->domain.type & __IOMMU_DOMAIN_PAGING)
- free_io_pgtable_ops(&domain->iop.pgtbl.ops);
+ pt_iommu_deinit(&domain->iommu);
pdom_id_free(domain->id);
kfree(domain);
}
static int blocked_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
@@ -2685,16 +2858,8 @@ void amd_iommu_init_identity_domain(void)
protection_domain_init(&identity_domain);
}
-/* Same as blocked domain except it supports only ops->attach_dev() */
-static struct iommu_domain release_domain = {
- .type = IOMMU_DOMAIN_BLOCKED,
- .ops = &(const struct iommu_domain_ops) {
- .attach_dev = blocked_domain_attach_device,
- }
-};
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+ struct iommu_domain *old)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
struct protection_domain *domain = to_pdomain(dom);
@@ -2734,93 +2899,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
return ret;
}
-static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
- unsigned long iova, size_t size)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-
- if (ops->map_pages)
- domain_flush_np_cache(domain, iova, size);
- return 0;
-}
-
-static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int iommu_prot, gfp_t gfp, size_t *mapped)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
- int prot = 0;
- int ret = -EINVAL;
-
- if ((domain->pd_mode == PD_MODE_V1) &&
- (domain->iop.mode == PAGE_MODE_NONE))
- return -EINVAL;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- if (ops->map_pages) {
- ret = ops->map_pages(ops, iova, paddr, pgsize,
- pgcount, prot, gfp, mapped);
- }
-
- return ret;
-}
-
-static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size)
-{
- /*
- * AMD's IOMMU can flush as many pages as necessary in a single flush.
- * Unless we run in a virtual machine, which can be inferred according
- * to whether "non-present cache" is on, it is probably best to prefer
- * (potentially) too extensive TLB flushing (i.e., more misses) over
- * mutliple TLB flushes (i.e., more flushes). For virtual machines the
- * hypervisor needs to synchronize the host IOMMU PTEs with those of
- * the guest, and the trade-off is different: unnecessary TLB flushes
- * should be avoided.
- */
- if (amd_iommu_np_cache &&
- iommu_iotlb_gather_is_disjoint(gather, iova, size))
- iommu_iotlb_sync(domain, gather);
-
- iommu_iotlb_gather_add_range(gather, iova, size);
-}
-
-static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
- size_t r;
-
- if ((domain->pd_mode == PD_MODE_V1) &&
- (domain->iop.mode == PAGE_MODE_NONE))
- return 0;
-
- r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
-
- if (r)
- amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
-
- return r;
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- dma_addr_t iova)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-
- return ops->iova_to_phys(ops, iova);
-}
-
static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
{
switch (cap) {
@@ -2887,28 +2965,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
return 0;
}
-static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct protection_domain *pdomain = to_pdomain(domain);
- struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops;
- unsigned long lflags;
-
- if (!ops || !ops->read_and_clear_dirty)
- return -EOPNOTSUPP;
-
- spin_lock_irqsave(&pdomain->lock, lflags);
- if (!pdomain->dirty_tracking && dirty->bitmap) {
- spin_unlock_irqrestore(&pdomain->lock, lflags);
- return -EINVAL;
- }
- spin_unlock_irqrestore(&pdomain->lock, lflags);
-
- return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
-}
-
static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head)
{
@@ -2978,28 +3034,6 @@ static bool amd_iommu_is_attach_deferred(struct device *dev)
return dev_data->defer_attach;
}
-static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
-{
- struct protection_domain *dom = to_pdomain(domain);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_all(dom);
- spin_unlock_irqrestore(&dom->lock, flags);
-}
-
-static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather)
-{
- struct protection_domain *dom = to_pdomain(domain);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_pages(dom, gather->start,
- gather->end - gather->start + 1);
- spin_unlock_irqrestore(&dom->lock, flags);
-}
-
static int amd_iommu_def_domain_type(struct device *dev)
{
struct iommu_dev_data *dev_data;
@@ -3034,15 +3068,10 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true;
}
-static const struct iommu_dirty_ops amd_dirty_ops = {
- .set_dirty_tracking = amd_iommu_set_dirty_tracking,
- .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
-};
-
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.blocked_domain = &blocked_domain,
- .release_domain = &release_domain,
+ .release_domain = &blocked_domain,
.identity_domain = &identity_domain.domain,
.domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags,
.domain_alloc_sva = amd_iommu_domain_alloc_sva,
@@ -3053,17 +3082,6 @@ const struct iommu_ops amd_iommu_ops = {
.is_attach_deferred = amd_iommu_is_attach_deferred,
.def_domain_type = amd_iommu_def_domain_type,
.page_response = amd_iommu_page_response,
- .default_domain_ops = &(const struct iommu_domain_ops) {
- .attach_dev = amd_iommu_attach_device,
- .map_pages = amd_iommu_map_pages,
- .unmap_pages = amd_iommu_unmap_pages,
- .iotlb_sync_map = amd_iommu_iotlb_sync_map,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .flush_iotlb_all = amd_iommu_flush_iotlb_all,
- .iotlb_sync = amd_iommu_iotlb_sync,
- .free = amd_iommu_domain_free,
- .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
- }
};
#ifdef CONFIG_IRQ_REMAP
@@ -3354,7 +3372,7 @@ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
struct irte_ga *irte)
{
- bool ret;
+ int ret;
ret = __modify_irte_ga(iommu, devid, index, irte);
if (ret)
@@ -4072,3 +4090,5 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
return 0;
}
#endif
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
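The recurring mechanical change in the driver diffs that follow is the new
attach_dev prototype: the core now passes the previously-attached domain to
the driver instead of each driver looking it up with
iommu_get_domain_for_dev(). A minimal sketch of the converted shape (the
function and its body are illustrative, not from this patch):

    static int example_attach_dev(struct iommu_domain *domain,
                                  struct device *dev,
                                  struct iommu_domain *old)
    {
            /* 'old' replaces iommu_get_domain_for_dev(dev) lookups, e.g.
             * when filling an attach-state structure as arm-smmu-v3 does. */
            return 0;
    }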
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index 95a4e62b8f63..83a5aabcd15d 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -672,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain,
}
static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret, i;
struct apple_dart_stream_map *stream_map;
@@ -693,7 +694,8 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
}
static int apple_dart_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
struct apple_dart_stream_map *stream_map;
@@ -717,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = {
};
static int apple_dart_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
struct apple_dart_stream_map *stream_map;
@@ -802,6 +805,8 @@ static int apple_dart_of_xlate(struct device *dev,
struct apple_dart *cfg_dart;
int i, sid;
+ put_device(&iommu_pdev->dev);
+
if (args->args_count != 1)
return -EINVAL;
sid = args->args[0];
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 8cd8929bbfdf..93fdadd07431 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -99,6 +99,8 @@ static void arm_smmu_make_nested_domain_ste(
int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
struct arm_smmu_nested_domain *nested_domain)
{
+ unsigned int cfg =
+ FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
struct arm_smmu_vmaster *vmaster;
unsigned long vsid;
int ret;
@@ -107,8 +109,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core,
state->master->dev, &vsid);
- if (ret)
+ /*
+ * Attaching to a translating nested domain requires a vDEVICE to be
+ * allocated first, as CD/ATS invalidations and vevents need a vSID to
+ * work properly. An abort/bypass domain is allowed to attach without
+ * a vmaster, for the GBPA case.
+ */
+ if (ret) {
+ if (cfg == STRTAB_STE_0_CFG_ABORT ||
+ cfg == STRTAB_STE_0_CFG_BYPASS)
+ return 0;
return ret;
+ }
vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL);
if (!vmaster)
@@ -138,14 +149,15 @@ void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master)
}
static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_nested_domain *nested_domain =
to_smmu_nested_domain(domain);
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
struct arm_smmu_attach_state state = {
.master = master,
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
struct arm_smmu_ste ste;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 2a8b46b948f0..d16d35c78c06 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1464,7 +1464,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master)
cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size,
&cd_table->cdtab_dma,
GFP_KERNEL);
- if (!cd_table->l2.l2ptrs) {
+ if (!cd_table->l2.l1tab) {
ret = -ENOMEM;
goto err_free_l2ptrs;
}
@@ -3002,7 +3002,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
master->ats_enabled = state->ats_enabled;
}
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old_domain)
{
int ret = 0;
struct arm_smmu_ste target;
@@ -3010,7 +3011,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
struct arm_smmu_device *smmu;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_attach_state state = {
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
struct arm_smmu_master *master;
@@ -3186,7 +3187,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
/*
* When the last user of the CD table goes away downgrade the STE back
- * to a non-cd_table one.
+ * to a non-cd_table one, by re-attaching its sid_domain.
*/
if (!arm_smmu_ssids_in_use(&master->cd_table)) {
struct iommu_domain *sid_domain =
@@ -3194,12 +3195,14 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
if (sid_domain->type == IOMMU_DOMAIN_IDENTITY ||
sid_domain->type == IOMMU_DOMAIN_BLOCKED)
- sid_domain->ops->attach_dev(sid_domain, dev);
+ sid_domain->ops->attach_dev(sid_domain, dev,
+ sid_domain);
}
return 0;
}
static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
+ struct iommu_domain *old_domain,
struct device *dev,
struct arm_smmu_ste *ste,
unsigned int s1dss)
@@ -3207,7 +3210,7 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
struct arm_smmu_attach_state state = {
.master = master,
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
@@ -3248,14 +3251,16 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
}
static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_ste ste;
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
arm_smmu_master_clear_vmaster(master);
arm_smmu_make_bypass_ste(master->smmu, &ste);
- arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS);
+ arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
+ STRTAB_STE_1_S1DSS_BYPASS);
return 0;
}
@@ -3269,14 +3274,15 @@ static struct iommu_domain arm_smmu_identity_domain = {
};
static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_ste ste;
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
arm_smmu_master_clear_vmaster(master);
arm_smmu_make_abort_ste(&ste);
- arm_smmu_attach_dev_ste(domain, dev, &ste,
+ arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
STRTAB_STE_1_S1DSS_TERMINATE);
return 0;
}
@@ -3582,12 +3588,6 @@ static void arm_smmu_release_device(struct device *dev)
WARN_ON(master->iopf_refcount);
- /* Put the STE back to what arm_smmu_init_strtab() sets */
- if (dev->iommu->require_direct)
- arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev);
- else
- arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev);
-
arm_smmu_disable_pasid(master);
arm_smmu_remove_master(master);
if (arm_smmu_cdtab_allocated(&master->cd_table))
@@ -3678,6 +3678,7 @@ static int arm_smmu_def_domain_type(struct device *dev)
static const struct iommu_ops arm_smmu_ops = {
.identity_domain = &arm_smmu_identity_domain,
.blocked_domain = &arm_smmu_blocked_domain,
+ .release_domain = &arm_smmu_blocked_domain,
.capable = arm_smmu_capable,
.hw_info = arm_smmu_hw_info,
.domain_alloc_sva = arm_smmu_sva_domain_alloc,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 57c097e87613..573085349df3 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -367,6 +367,7 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
{ .compatible = "qcom,adreno" },
{ .compatible = "qcom,adreno-gmu" },
+ { .compatible = "qcom,glymur-mdss" },
{ .compatible = "qcom,mdp4" },
{ .compatible = "qcom,mdss" },
{ .compatible = "qcom,qcm2290-mdss" },
@@ -431,17 +432,19 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
/*
* Some platforms support more than the Arm SMMU architected maximum of
- * 128 stream matching groups. For unknown reasons, the additional
- * groups don't exhibit the same behavior as the architected registers,
- * so limit the groups to 128 until the behavior is fixed for the other
- * groups.
+ * 128 stream matching groups. The additional registers appear to have
+ * the same behavior as the architected registers in the hardware.
+ * However, on some firmware versions, the hypervisor does not
+ * correctly trap and emulate accesses to the additional registers,
+ * resulting in unexpected behavior.
+ *
+ * If there are more than 128 groups, use the last reliable group to
+ * detect if we need to apply the bypass quirk.
*/
- if (smmu->num_mapping_groups > 128) {
- dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
- smmu->num_mapping_groups = 128;
- }
-
- last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
+ if (smmu->num_mapping_groups > 128)
+ last_s2cr = ARM_SMMU_GR0_S2CR(127);
+ else
+ last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
/*
* With some firmware versions writes to S2CR of type FAULT are
@@ -464,6 +467,11 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS);
arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg);
+
+ if (smmu->num_mapping_groups > 128) {
+ dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
+ smmu->num_mapping_groups = 128;
+ }
}
for (i = 0; i < smmu->num_mapping_groups; i++) {
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4ced4b5bee4d..5e690cf85ec9 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1165,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg,
}
}
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
@@ -1234,7 +1235,8 @@ static int arm_smmu_attach_dev_type(struct device *dev,
}
static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS);
}
@@ -1249,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = {
};
static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT);
}
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index c5be95e56031..f69d9276dc55 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -359,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
kfree(qcom_domain);
}
-static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int qcom_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev, struct iommu_domain *old)
{
struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
@@ -388,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
}
static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct qcom_iommu_domain *qcom_domain;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
unsigned int i;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- qcom_domain = to_qcom_iommu_domain(domain);
+ qcom_domain = to_qcom_iommu_domain(old);
if (WARN_ON(!qcom_domain->iommu))
return -EINVAL;
@@ -565,14 +566,14 @@ static int qcom_iommu_of_xlate(struct device *dev,
qcom_iommu = platform_get_drvdata(iommu_pdev);
+ put_device(&iommu_pdev->dev);
+
/* make sure the asid specified in dt is valid, so we don't have
* to sanity check this elsewhere:
*/
if (WARN_ON(asid > qcom_iommu->max_asid) ||
- WARN_ON(qcom_iommu->ctxs[asid] == NULL)) {
- put_device(&iommu_pdev->dev);
+ WARN_ON(qcom_iommu->ctxs[asid] == NULL))
return -EINVAL;
- }
if (!dev_iommu_priv_get(dev)) {
dev_iommu_priv_set(dev, qcom_iommu);
@@ -581,10 +582,8 @@ static int qcom_iommu_of_xlate(struct device *dev,
* multiple different iommu devices. Multiple context
* banks are ok, but multiple devices are not:
*/
- if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) {
- put_device(&iommu_pdev->dev);
+ if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev)))
return -EINVAL;
- }
}
return iommu_fwspec_add_ids(dev, &asid, 1);
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f1fb27681b0b..c92088855450 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
* as a bus address, __finalise_sg() will copy the dma
* address into the output segment.
*/
- s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
- sg_phys(s));
+ s->dma_address = pci_p2pdma_bus_addr_map(
+ p2pdma_state.mem, sg_phys(s));
sg_dma_len(s) = sg->length;
sg_dma_mark_bus_address(s);
continue;
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index b6edd178fe25..b512c6b939ac 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -984,7 +984,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
}
static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
struct exynos_iommu_domain *domain;
@@ -1035,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = {
};
static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
@@ -1044,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
unsigned long flags;
int err;
- err = exynos_iommu_identity_attach(&exynos_identity_domain, dev);
+ err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old);
if (err)
return err;
@@ -1429,8 +1431,6 @@ static void exynos_iommu_release_device(struct device *dev)
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
struct sysmmu_drvdata *data;
- WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev));
-
list_for_each_entry(data, &owner->controllers, owner_node)
device_link_del(data->link);
}
@@ -1446,17 +1446,14 @@ static int exynos_iommu_of_xlate(struct device *dev,
return -ENODEV;
data = platform_get_drvdata(sysmmu);
- if (!data) {
- put_device(&sysmmu->dev);
+ put_device(&sysmmu->dev);
+ if (!data)
return -ENODEV;
- }
if (!owner) {
owner = kzalloc(sizeof(*owner), GFP_KERNEL);
- if (!owner) {
- put_device(&sysmmu->dev);
+ if (!owner)
return -ENOMEM;
- }
INIT_LIST_HEAD(&owner->controllers);
mutex_init(&owner->rpm_lock);
@@ -1476,6 +1473,7 @@ static int exynos_iommu_of_xlate(struct device *dev,
static const struct iommu_ops exynos_iommu_ops = {
.identity_domain = &exynos_identity_domain,
+ .release_domain = &exynos_identity_domain,
.domain_alloc_paging = exynos_iommu_domain_alloc_paging,
.device_group = generic_device_group,
.probe_device = exynos_iommu_probe_device,
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 5f08523f97cb..9664ef9840d2 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
}
static int fsl_pamu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
unsigned long flags;
@@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain,
* switches to what looks like BLOCKING.
*/
static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct fsl_dma_domain *dma_domain;
const u32 *prop;
int len;
@@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
* Hack to keep things working as they always have, only leaving an
* UNMANAGED domain makes it BLOCKING.
*/
- if (domain == platform_domain || !domain ||
- domain->type != IOMMU_DOMAIN_UNMANAGED)
+ if (old == platform_domain || !old ||
+ old->type != IOMMU_DOMAIN_UNMANAGED)
return 0;
- dma_domain = to_fsl_dma_domain(domain);
+ dma_domain = to_fsl_dma_domain(old);
/*
* Use LIODN of the PCI controller while detaching a
diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
new file mode 100644
index 000000000000..52ac9e661ffd
--- /dev/null
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -0,0 +1,14 @@
+CONFIG_KUNIT=y
+CONFIG_GENERIC_PT=y
+CONFIG_DEBUG_GENERIC_PT=y
+CONFIG_IOMMU_PT=y
+CONFIG_IOMMU_PT_AMDV1=y
+CONFIG_IOMMU_PT_VTDSS=y
+CONFIG_IOMMU_PT_X86_64=y
+CONFIG_IOMMU_PT_KUNIT_TEST=y
+
+CONFIG_IOMMUFD=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_FAULT_INJECTION=y
+CONFIG_RUNTIME_TESTING_MENU=y
+CONFIG_IOMMUFD_TEST=y
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
new file mode 100644
index 000000000000..ce4fb4786914
--- /dev/null
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig GENERIC_PT
+ bool "Generic Radix Page Table" if COMPILE_TEST
+ help
+ Generic library for building radix tree page tables.
+
+ Generic PT provides a set of HW page table formats and a common
+ set of APIs to work with them.
+
+if GENERIC_PT
+config DEBUG_GENERIC_PT
+ bool "Extra debugging checks for GENERIC_PT"
+ help
+ Enable extra runtime debugging checks for GENERIC_PT code. This
+ incurs a runtime cost and should not be enabled for production
+ kernels.
+
+ The kunit tests require this to be enabled to get full coverage.
+
+config IOMMU_PT
+ tristate "IOMMU Page Tables"
+ select IOMMU_API
+ depends on IOMMU_SUPPORT
+ depends on GENERIC_PT
+ help
+ Generic library for building IOMMU page tables.
+
+ IOMMU_PT provides an implementation of the page table operations
+ related to struct iommu_domain using GENERIC_PT. It provides a single
+ implementation of the page table operations that can be shared by
+ multiple drivers.
+
+if IOMMU_PT
+config IOMMU_PT_AMDV1
+ tristate "IOMMU page table for 64-bit AMD IOMMU v1"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the AMD v1 page table. AMDv1 is the
+ "host" page table. It supports granular page sizes of almost every
+ power of 2 and decodes the full 64-bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_VTDSS
+ tristate "IOMMU page table for Intel VT-d Second Stage"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for Intel VT-d's 64-bit 3/4/5
+ level Second Stage page table. It is similar to the X86_64 format, with
+ 4K/2M/1G page sizes.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_X86_64
+ tristate "IOMMU page table for x86 64-bit, 4/5 levels"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the x86 64-bit 4/5 level page table.
+ It supports 4K/2M/1G page sizes and can decode a sign-extended
+ portion of the 64-bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_KUNIT_TEST
+ tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS
+ depends on KUNIT
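+ # The "FMT || !FMT" idiom limits this test to =m whenever an enabled
+ # format is itself built as =m: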
+ depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
+ depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
+ depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS
+ default KUNIT_ALL_TESTS
+ help
+ Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the
+ enabled page table formats. The test covers most of the GENERIC_PT
+ functions provided by the page table format, as well as covering the
+ iommu_domain related functions.
+
+endif
+endif
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
new file mode 100644
index 000000000000..976b49ec97dc
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
+iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
+
+IOMMU_PT_KUNIT_TEST :=
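+
+# create_format(fmt, y|m) emits the build rules for one format: the
+# iommu_$(fmt).o object plus a kunit twin built from the same source
+# with -DGENERIC_PT_KUNIT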
+define create_format
+obj-$(2) += iommu_$(1).o
+iommu_pt_kunit_test-y += kunit_iommu_$(1).o
+CFLAGS_kunit_iommu_$(1).o += -DGENERIC_PT_KUNIT=1
+IOMMU_PT_KUNIT_TEST := iommu_pt_kunit_test.o
+
+endef
+
+$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y)))
+$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m)))
+
+# The kunit objects are constructed by compiling the main source
+# with -DGENERIC_PT_KUNIT
+$(obj)/kunit_iommu_%.o: $(src)/iommu_%.c FORCE
+ $(call rule_mkdir)
+ $(call if_changed_dep,cc_o_c)
+
+obj-$(CONFIG_IOMMU_PT_KUNIT_TEST) += $(IOMMU_PT_KUNIT_TEST)
diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h
new file mode 100644
index 000000000000..aa8e1a8ec95f
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/amdv1.h
@@ -0,0 +1,411 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * AMD IOMMU v1 page table
+ *
+ * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
+ * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
+ *
+ * Note the level numbering here matches the core code, so level 0 is the same
+ * as mode 1.
+ *
+ */
+#ifndef __GENERIC_PT_FMT_AMDV1_H
+#define __GENERIC_PT_FMT_AMDV1_H
+
+#include "defs_amdv1.h"
+#include "../pt_defs.h"
+
+#include <asm/page.h>
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/mem_encrypt.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+
+enum {
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ /*
+ * The IOMMUFD selftest uses the AMDv1 format with some alterations. It
+ * uses a 2k page size to test cases where the CPU page size is not the
+ * same as the IOMMU page size.
+ */
+#ifdef AMDV1_IOMMUFD_SELFTEST
+ PT_MAX_VA_ADDRESS_LG2 = 56,
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 11,
+#else
+ PT_MAX_VA_ADDRESS_LG2 = 64,
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_TOP_LEVEL = 5,
+ PT_GRANULE_LG2SZ = 12,
+#endif
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* The DTE only has these bits for the top physical address */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* PTE bits */
+enum {
+ AMDV1PT_FMT_PR = BIT(0),
+ AMDV1PT_FMT_D = BIT(6),
+ AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
+ AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
+ AMDV1PT_FMT_FC = BIT_ULL(60),
+ AMDV1PT_FMT_IR = BIT_ULL(61),
+ AMDV1PT_FMT_IW = BIT_ULL(62),
+};
+
+/*
+ * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum;
+ * use these defines to avoid it.
+ */
+#define AMDV1PT_FMT_NL_DEFAULT 0
+#define AMDV1PT_FMT_NL_SIZE 7
+
+static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
+}
+#define pt_table_pa amdv1pt_table_pa
+
+/* Returns the oa for the start of the contiguous entry */
+static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+ pt_oaddr_t oa;
+
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ oa = FIELD_GET(AMDV1PT_FMT_OA, entry);
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
+ unsigned int sz_bits = oaffz(oa);
+
+ oa = oalog2_set_mod(oa, 0, sz_bits);
+ } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
+ AMDV1PT_FMT_NL_DEFAULT))
+ return 0;
+ return oalog2_mul(oa, PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa amdv1pt_entry_oa
+
+static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
+{
+ /*
+ * Table 15: Page Table Level Parameters
+ * The topmost level cannot have translation entries
+ */
+ return pts->level < PT_MAX_TOP_LEVEL;
+}
+#define pt_can_have_leaf amdv1pt_can_have_leaf
+
+/* Body in pt_fmt_defaults.h */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+static inline unsigned int
+amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ u32 code;
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
+ AMDV1PT_FMT_NL_DEFAULT)
+ return ilog2(1);
+
+ PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
+ AMDV1PT_FMT_NL_SIZE);
+
+ /*
+ * The contiguous size is encoded in the length of a string of 1's in
+ * the low bits of the OA. Reverse the equation:
+ * code = log2_to_int(num_contig_lg2 + item_lg2sz -
+ * PT_GRANULE_LG2SZ - 1) - 1
+ * Which can be expressed as:
+ * num_contig_lg2 = oalog2_ffz(code) + 1 -
+ * item_lg2sz - PT_GRANULE_LG2SZ
+ *
+ * Assume the bit layout is correct and remove the masking. Reorganize
+ * the equation to move all the arithmetic before the ffz.
+ */
+ code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
+ pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
+ return ffz_t(u32, code);
+}
+#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2
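+
+/*
+ * Worked example (illustrative): a 2M mapping with a 4k granule has
+ * oasz_lg2 = 21, so the install path stores 2^(21 - 12 - 1) - 1 = 0xff
+ * in the low OA bits (PTE bits 19:12). The shift above also pulls in
+ * bit 11, which is always 1 for NL_SIZE, so code = 0x1ff and
+ * ffz(code) = 9 = num_contig_lg2, i.e. 512 contiguous 4k items.
+ */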
+
+static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
+{
+ /*
+ * The top entry covers bits [63:57] only; this is handled through
+ * max_vasz_lg2.
+ */
+ if (PT_WARN_ON(pts->level == 5))
+ return 7;
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 amdv1pt_num_items_lg2
+
+static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!amdv1pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * Table 14: Example Page Size Encodings
+ * Address bits 51:32 can be used to encode page sizes greater than 4
+ * Gbytes. Address bits 63:52 are zero-extended.
+ *
+ * 512GB Pages are not supported due to a hardware bug.
+ * Otherwise every power of two size is supported.
+ */
+ return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
+ isz_lg2) & ~SZ_512G;
+}
+#define pt_possible_sizes amdv1pt_possible_sizes
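+
+/*
+ * For example (illustrative): at level 0 this evaluates to
+ * GENMASK(20, 12), i.e. every power of two from 4k to 1M; at level 3 it
+ * is GENMASK(47, 39) & ~SZ_512G, i.e. 1T through 128T with the buggy
+ * 512G size carved out.
+ */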
+
+static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ unsigned int next_level;
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(*tablep);
+ if (!(entry & AMDV1PT_FMT_PR))
+ return PT_ENTRY_EMPTY;
+
+ next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
+ if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
+ next_level == AMDV1PT_FMT_NL_SIZE)
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw amdv1pt_load_entry_raw
+
+static inline void
+amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+
+ if (oasz_lg2 == isz_lg2) {
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_DEFAULT);
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_SIZE) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
+ 1) -
+ 1);
+
+ /* See amdv1pt_clear_entries() */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ memset64(tablep, entry, log2_to_int(num_contig_lg2));
+ }
+ }
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry amdv1pt_install_leaf_entry
+
+static inline bool amdv1pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ /*
+ * IR and IW are ANDed from the table levels along with the PTE. We
+ * always control permissions from the PTE, so always set IR and IW for
+ * tables.
+ */
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ log2_div(table_pa, PT_GRANULE_LG2SZ)) |
+ AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table amdv1pt_install_table
+
+static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits =
+ pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
+}
+#define pt_attr_from_entry amdv1pt_attr_from_entry
+
+static inline void amdv1pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ /*
+ * gcc generates rep stos for the io-pgtable code, and this difference
+ * can show up in microbenchmarks with larger contiguous page sizes.
+ * rep is slower for small counts.
+ */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+ } else {
+ memset64(tablep, 0, log2_to_int(num_contig_lg2));
+ }
+}
+#define pt_clear_entries amdv1pt_clear_entries
+
+static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
+ return true;
+ return false;
+}
+#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty
+
+static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
+}
+#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean
+
+static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | AMDV1PT_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_amdv1
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
+ ->amdpt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
+}
+
+static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
+ pte |= AMDV1PT_FMT_FC;
+ if (iommu_prot & IOMMU_READ)
+ pte |= AMDV1PT_FMT_IR;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= AMDV1PT_FMT_IW;
+
+ /*
+ * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+ * control this. For now, if the tables use sme_set() then so do the PTEs.
+ */
+ if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot amdv1pt_iommu_set_prot
+
+static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
+ const struct pt_iommu_amdv1_cfg *cfg)
+{
+ struct pt_amdv1 *table = &iommu_table->amdpt;
+ unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+
+ if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
+ return -EINVAL;
+
+ if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
+ cfg->starting_level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
+ (cfg->starting_level + 1);
+
+ table->common.max_vasz_lg2 =
+ min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ pt_top_set_level(&table->common, cfg->starting_level);
+ return 0;
+}
+#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
+
+#ifndef PT_FMT_VARIANT
+static inline void
+amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_amdv1_hw_info *info)
+{
+ info->host_pt_root = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
+ info->mode = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
+#endif
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
+ /* Matches what io_pgtable does */
+ [0] = { .starting_level = 2 },
+};
+#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = 0 };
+#endif
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
new file mode 100644
index 000000000000..0b9614ca6d10
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H
+#define __GENERIC_PT_FMT_DEFS_AMDV1_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct amdv1pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs amdv1pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
new file mode 100644
index 000000000000..4a239bcaae2a
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
+#define __GENERIC_PT_FMT_DEFS_VTDSS_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct vtdss_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs vtdss_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
new file mode 100644
index 000000000000..6f589e1f55d3
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H
+#define __GENERIC_PT_FMT_DEFS_X86_64_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct x86_64_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs x86_64_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
new file mode 100644
index 000000000000..72a2337d0c55
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT amdv1
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \
+ BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+#define PT_FORCE_ENABLED_FEATURES \
+ (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c
new file mode 100644
index 000000000000..74e597cba9d9
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define AMDV1_IOMMUFD_SELFTEST 1
+#define PT_FMT amdv1
+#define PT_FMT_VARIANT mock
+#define PT_SUPPORTED_FEATURES 0
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
new file mode 100644
index 000000000000..d28e86abdf2e
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Template to build the iommu module and kunit from the format and
+ * implementation headers.
+ *
+ * The format should have:
+ * #define PT_FMT <name>
+ * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
+ * And optionally:
+ * #define PT_FORCE_ENABLED_FEATURES ..
+ * #define PT_FMT_VARIANT <suffix>
+ */
+#include <linux/args.h>
+#include <linux/stringify.h>
+
+#ifdef PT_FMT_VARIANT
+#define PTPFX_RAW \
+ CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
+#else
+#define PTPFX_RAW PT_FMT
+#endif
+
+#define PTPFX CONCATENATE(PTPFX_RAW, _)
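+
+/*
+ * For instance (illustrative): iommu_mock.c sets PT_FMT to amdv1 and
+ * PT_FMT_VARIANT to mock, so PTPFX_RAW expands to amdv1_mock and every
+ * templated symbol gets the amdv1_mock_ prefix, e.g. iommu_pt.h's
+ * DOMAIN_NS(iova_to_phys) becomes pt_iommu_amdv1_mock_iova_to_phys.
+ */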
+
+#define _PT_FMT_H PT_FMT.h
+#define PT_FMT_H __stringify(_PT_FMT_H)
+
+#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
+#define PT_DEFS_H __stringify(_PT_DEFS_H)
+
+#include <linux/generic_pt/common.h>
+#include PT_DEFS_H
+#include "../pt_defs.h"
+#include PT_FMT_H
+#include "../pt_common.h"
+
+#ifndef GENERIC_PT_KUNIT
+#include "../iommu_pt.h"
+#else
+/*
+ * The makefile will compile the .c file twice, once with GENERIC_PT_KUNIT set
+ * which means we are building the kunit modle.
+ */
+#include "../kunit_generic_pt.h"
+#include "../kunit_iommu_pt.h"
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
new file mode 100644
index 000000000000..f551711e2a33
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT vtdss
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \
+ BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
new file mode 100644
index 000000000000..5472660c2d71
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT x86_64
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \
+ BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
new file mode 100644
index 000000000000..f5f8981edde7
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Intel VT-d Second Stage 3/4/5 level page table
+ *
+ * This is described in
+ * Section "3.7 Second-Stage Translation"
+ * Section "9.8 Second-Stage Paging Entries"
+ *
+ * Of the "Intel Virtualization Technology for Directed I/O Architecture
+ * Specification".
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/SS-PTE - 0
+ * Directory/SS-PDE - 1
+ * Directory Ptr/SS-PDPTE - 2
+ * PML4/SS-PML4E - 3
+ * PML5/SS-PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_VTDSS_H
+#define __GENERIC_PT_FMT_VTDSS_H
+
+#include "defs_vtdss.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* SSPTPTR is 4k aligned and limited by HAW */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ VTDSS_FMT_R = BIT(0),
+ VTDSS_FMT_W = BIT(1),
+ VTDSS_FMT_A = BIT(8),
+ VTDSS_FMT_D = BIT(9),
+ VTDSS_FMT_SNP = BIT(11),
+ VTDSS_FMT_OA = GENMASK_ULL(51, 12),
+};
+
+/* PDPTE/PDE */
+enum {
+ VTDSS_FMT_PS = BIT(7),
+};
+
+#define common_to_vtdss_pt(common_ptr) \
+ container_of_const(common_ptr, struct pt_vtdss, common)
+#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)
+
+static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa vtdss_pt_table_pa
+
+static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa vtdss_pt_entry_oa
+
+static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf vtdss_pt_can_have_leaf
+
+static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 vtdss_pt_num_items_lg2
+
+static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!entry)
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw vtdss_pt_load_entry_raw
+
+static inline void
+vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= VTDSS_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry vtdss_pt_install_leaf_entry
+
+static inline bool vtdss_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = VTDSS_FMT_R | VTDSS_FMT_W |
+ FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table vtdss_pt_install_table
+
+static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
+}
+#define pt_attr_from_entry vtdss_pt_attr_from_entry
+
+static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ return READ_ONCE(*tablep) & VTDSS_FMT_D;
+}
+#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty
+
+static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
+}
+#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean
+
+static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | VTDSS_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty
+
+static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
+{
+ return 10;
+}
+#define pt_max_sw_bit vtdss_pt_max_sw_bit
+
+static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
+{
+ if (__builtin_constant_p(bitnr) && bitnr > 10)
+ BUILD_BUG();
+
+ /* Bits marked Ignored in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(10);
+ case 1 ... 9:
+ return BIT_ULL((bitnr - 1) + 52);
+ case 10:
+ return BIT_ULL(63);
+ /* Some bits in 9-3 are available in some entries */
+ default:
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit vtdss_pt_sw_bit
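+
+/*
+ * In other words (illustrative): sw bit 0 maps to PTE bit 10, sw bits
+ * 1-9 map to PTE bits 52-60, and sw bit 10 maps to PTE bit 63.
+ */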
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_vtdss
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->vtdss_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
+ ->iommu;
+}
+
+static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ /*
+ * VTDSS does not have a present bit, so we tell whether an entry is present
+ * by checking for R or W.
+ */
+ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+ return -EINVAL;
+
+ if (iommu_prot & IOMMU_READ)
+ pte |= VTDSS_FMT_R;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= VTDSS_FMT_W;
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
+ pte |= VTDSS_FMT_SNP;
+
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
+ !(iommu_prot & IOMMU_WRITE)) {
+ pr_err_ratelimited(
+ "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
+
+static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
+ const struct pt_iommu_vtdss_cfg *cfg)
+{
+ struct pt_vtdss *table = &iommu_table->vtdss_pt;
+
+ if (cfg->top_level > 4 || cfg->top_level < 2)
+ return -EOPNOTSUPP;
+
+ pt_top_set_level(&table->common, cfg->top_level);
+ return 0;
+}
+#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
+
+static inline void
+vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_vtdss_hw_info *info)
+{
+ info->ssptptr = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
+ /*
+ * top_level = 2 = 3 level table aw=1
+ * top_level = 3 = 4 level table aw=2
+ * top_level = 4 = 5 level table aw=3
+ */
+ info->aw = top_range->top_level - 1;
+}
+#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
+ [0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2},
+ [1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3},
+ [2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4},
+};
+#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
+#endif
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
new file mode 100644
index 000000000000..210748d9d6e8
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * x86 page table. Supports the 4 and 5 level variations.
+ *
+ * The 4 and 5 level version is described in:
+ * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software
+ * Developer's Manual Volume 3
+ *
+ * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization
+ * Technology for Directed I/O Architecture Specification"
+ *
+ * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
+ * Virtualization Technology (IOMMU) Specification"
+ *
+ * It is used by x86 CPUs, AMD and VT-d IOMMU HW.
+ *
+ * Note the 3 level format is very similar and could almost be implemented
+ * here, but the reserved/ignored layout is different and there are
+ * functional bit differences.
+ *
+ * This format uses PT_FEAT_SIGN_EXTEND to have an upper/non-canonical/lower
+ * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign
+ * extended addressing with this page table format.
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/PTE - 0
+ * Directory/PDE - 1
+ * Directory Ptr/PDPTE - 2
+ * PML4/PML4E - 3
+ * PML5/PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_X86_64_H
+#define __GENERIC_PT_FMT_X86_64_H
+
+#include "defs_x86_64.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+#include <linux/mem_encrypt.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /*
+ * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k
+ * aligned and is limited by the architected HAW.
+ */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ X86_64_FMT_P = BIT(0),
+ X86_64_FMT_RW = BIT(1),
+ X86_64_FMT_U = BIT(2),
+ X86_64_FMT_A = BIT(5),
+ X86_64_FMT_D = BIT(6),
+ X86_64_FMT_OA = GENMASK_ULL(51, 12),
+ X86_64_FMT_XD = BIT_ULL(63),
+};
+
+/* PDPTE/PDE */
+enum {
+ X86_64_FMT_PS = BIT(7),
+};
+
+static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa x86_64_pt_table_pa
+
+static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa x86_64_pt_entry_oa
+
+static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf x86_64_pt_can_have_leaf
+
+static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 x86_64_pt_num_items_lg2
+
+static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!(entry & X86_64_FMT_P))
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw x86_64_pt_load_entry_raw
+
+static inline void
+x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = X86_64_FMT_P |
+ FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= X86_64_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry x86_64_pt_install_leaf_entry
+
+static inline bool x86_64_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+ FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table x86_64_pt_install_table
+
+static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+ X86_64_FMT_D | X86_64_FMT_XD);
+}
+#define pt_attr_from_entry x86_64_pt_attr_from_entry
+
+static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common)
+{
+ return 12;
+}
+#define pt_max_sw_bit x86_64_pt_max_sw_bit
+
+static inline u64 x86_64_pt_sw_bit(unsigned int bitnr)
+{
+ if (__builtin_constant_p(bitnr) && bitnr > 12)
+ BUILD_BUG();
+
+ /* Bits marked Ignored/AVL in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(9);
+ case 1:
+ return BIT(11);
+ case 2 ... 12:
+ return BIT_ULL((bitnr - 2) + 52);
+ /* Some bits in 8,6,4,3 are available in some entries */
+ default:
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit x86_64_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_x86_64
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->x86_64_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, x86_64_pt.common)
+ ->iommu;
+}
+
+static inline int x86_64_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte;
+
+ pte = X86_64_FMT_U | X86_64_FMT_A;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= X86_64_FMT_RW | X86_64_FMT_D;
+
+ /*
+ * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+ * control this. For now, if the tables use sme_set() then so do the PTEs.
+ */
+ if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot x86_64_pt_iommu_set_prot
+
+static inline int
+x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table,
+ const struct pt_iommu_x86_64_cfg *cfg)
+{
+ struct pt_x86_64 *table = &iommu_table->x86_64_pt;
+
+ if (cfg->top_level < 3 || cfg->top_level > 4)
+ return -EOPNOTSUPP;
+
+ pt_top_set_level(&table->common, cfg->top_level);
+
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ return 0;
+}
+#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init
+
+static inline void
+x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_x86_64_hw_info *info)
+{
+ info->gcr3_pt = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK);
+ info->levels = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = {
+ [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+ .common.hw_max_vasz_lg2 = 48, .top_level = 3 },
+ [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+ .common.hw_max_vasz_lg2 = 57, .top_level = 4 },
+ /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */
+ [2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 },
+ [3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4},
+};
+#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)};
+#endif
+#endif
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
new file mode 100644
index 000000000000..97aeda1ad01c
--- /dev/null
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -0,0 +1,1289 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * "Templated C code" for implementing the iommu operations for page tables.
+ * This is compiled multiple times, once per page table format, to pick up
+ * the per-format definitions.
+ */
+#ifndef __GENERIC_PT_IOMMU_PT_H
+#define __GENERIC_PT_IOMMU_PT_H
+
+#include "pt_iter.h"
+
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include "../iommu-pages.h"
+#include <linux/cleanup.h>
+#include <linux/dma-mapping.h>
+
+enum {
+ SW_BIT_CACHE_FLUSH_DONE = 0,
+};
+
+static void flush_writes_range(const struct pt_state *pts,
+ unsigned int start_index, unsigned int end_index)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, start_index * PT_ITEM_WORD_SIZE,
+ (end_index - start_index) * PT_ITEM_WORD_SIZE);
+}
+
+static void flush_writes_item(const struct pt_state *pts)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, pts->index * PT_ITEM_WORD_SIZE,
+ PT_ITEM_WORD_SIZE);
+}
+
+static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
+ struct pt_iommu *iommu_table, pt_vaddr_t iova,
+ pt_vaddr_t len,
+ struct iommu_pages_list *free_list)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(free_list,
+ iommu_table->iommu_device);
+
+ if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
+ iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
+ iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
+ /*
+ * Note that the sync frees the gather's free list, so we must
+ * not have any pages on that list that are covered by iova/len
+ */
+ } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
+ iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
+ }
+
+ iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
+}
+
+#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
+
+static int make_range_ul(struct pt_common *common, struct pt_range *range,
+ unsigned long iova, unsigned long len)
+{
+ unsigned long last;
+
+ if (unlikely(len == 0))
+ return -EINVAL;
+
+ if (check_add_overflow(iova, len - 1, &last))
+ return -EOVERFLOW;
+
+ *range = pt_make_range(common, iova, last);
+ if (sizeof(iova) > sizeof(range->va)) {
+ if (unlikely(range->va != iova || range->last_va != last))
+ return -EOVERFLOW;
+ }
+ return 0;
+}
+
+static __maybe_unused int make_range_u64(struct pt_common *common,
+ struct pt_range *range, u64 iova,
+ u64 len)
+{
+ if (unlikely(iova > ULONG_MAX || len > ULONG_MAX))
+ return -EOVERFLOW;
+ return make_range_ul(common, range, iova, len);
+}
+
+/*
+ * Some APIs use unsigned long, while others use dma_addr_t as the type. Dispatch
+ * to the correct validation based on the type.
+ */
+#define make_range_no_check(common, range, iova, len) \
+ ({ \
+ int ret; \
+ if (sizeof(iova) > sizeof(unsigned long) || \
+ sizeof(len) > sizeof(unsigned long)) \
+ ret = make_range_u64(common, range, iova, len); \
+ else \
+ ret = make_range_ul(common, range, iova, len); \
+ ret; \
+ })
+
+#define make_range(common, range, iova, len) \
+ ({ \
+ int ret = make_range_no_check(common, range, iova, len); \
+ if (!ret) \
+ ret = pt_check_range(range); \
+ ret; \
+ })
+
+static inline unsigned int compute_best_pgsize(struct pt_state *pts,
+ pt_oaddr_t oa)
+{
+ struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);
+
+ if (!pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * The page size is limited by the domain's bitmap. This allows the core
+ * code to reduce the supported page sizes by changing the bitmap.
+ */
+ return pt_compute_best_pgsize(pt_possible_sizes(pts) &
+ iommu_table->domain.pgsize_bitmap,
+ pts->range->va, pts->range->last_va, oa);
+}
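+
+/*
+ * Illustrative example: with a bitmap allowing SZ_4K | SZ_2M, a VA and OA that
+ * are both 2M aligned, and at least 2M remaining before last_va, this returns
+ * ilog2(SZ_2M); if only 1M remains it falls back to ilog2(SZ_4K).
+ */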
+
+static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ pt_oaddr_t *res = arg;
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY:
+ return -ENOENT;
+ case PT_ENTRY_TABLE:
+ return pt_descend(&pts, arg, descend_fn);
+ case PT_ENTRY_OA:
+ *res = pt_entry_oa_exact(&pts);
+ return 0;
+ }
+ return -ENOENT;
+}
+PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys);
+
+/**
+ * iova_to_phys() - Return the output address for the given IOVA
+ * @domain: Table to query
+ * @iova: IO virtual address to query
+ *
+ * Determine the output address from the given IOVA. @iova may have any
+ * alignment; the returned physical address includes the sub-page offset.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Return: The output address, or 0 if there is no translation for the given
+ * iova.
+ */
+phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_range range;
+ pt_oaddr_t res;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return ret;
+
+ ret = pt_walk_range(&range, __iova_to_phys, &res);
+ /* PHYS_ADDR_MAX would be a better error code */
+ if (ret)
+ return 0;
+ return res;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");
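+/*
+ * Minimal usage sketch (hypothetical caller, assuming the amdv1 instantiation):
+ *
+ *	phys_addr_t phys = pt_iommu_amdv1_iova_to_phys(domain, iova);
+ *	if (!phys)
+ *		; // no translation installed at iova
+ */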
+
+struct pt_iommu_dirty_args {
+ struct iommu_dirty_bitmap *dirty;
+ unsigned int flags;
+};
+
+static void record_dirty(struct pt_state *pts,
+ struct pt_iommu_dirty_args *dirty,
+ unsigned int num_contig_lg2)
+{
+ pt_vaddr_t dirty_len;
+
+ if (num_contig_lg2 != ilog2(1)) {
+ unsigned int index = pts->index;
+ unsigned int end_index = log2_set_mod_max_t(
+ unsigned int, pts->index, num_contig_lg2);
+
+ /* Adjust for being contained inside a contiguous page */
+ end_index = min(end_index, pts->end_index);
+ dirty_len = (end_index - index) *
+ log2_to_int(pt_table_item_lg2sz(pts));
+ } else {
+ dirty_len = log2_to_int(pt_table_item_lg2sz(pts));
+ }
+
+ if (dirty->dirty->bitmap)
+ iova_bitmap_set(dirty->dirty->bitmap, pts->range->va,
+ dirty_len);
+
+ if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
+ /*
+ * No write log required because DMA incoherence and atomic
+ * dirty tracking bits can't work together
+ */
+ pt_entry_make_write_clean(pts);
+ iommu_iotlb_gather_add_range(dirty->dirty->gather,
+ pts->range->va, dirty_len);
+ }
+}
+
+static inline int __read_and_clear_dirty(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_dirty_args *dirty = arg;
+ int ret;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ ret = pt_descend(&pts, arg, __read_and_clear_dirty);
+ if (ret)
+ return ret;
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts))
+ record_dirty(&pts, dirty,
+ pt_entry_num_contig_lg2(&pts));
+ }
+ return 0;
+}
+
+/**
+ * read_and_clear_dirty() - Manipulate the HW set write dirty state
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the IOVA
+ * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR
+ * @dirty: Place to store the dirty bits
+ *
+ * Iterate over all the entries in the mapped range and record their write dirty
+ * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is specified then
+ * the entries are left dirty, otherwise they are returned to being not
+ * write dirty.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_iommu_dirty_args dirty_args = {
+ .dirty = dirty,
+ .flags = flags,
+ };
+ struct pt_range range;
+ int ret;
+
+#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
+ return -EOPNOTSUPP;
+#endif
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
+ if (ret)
+ return ret;
+
+ ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
+ PT_WARN_ON(ret);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");
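+/*
+ * Usage sketch (hypothetical caller, assuming the amdv1 instantiation):
+ * harvest and clear the dirty bits for a 1G window:
+ *
+ *	rc = pt_iommu_amdv1_read_and_clear_dirty(domain, iova, SZ_1G, 0,
+ *						 &dirty);
+ */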
+
+static inline int __set_dirty(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY:
+ return -ENOENT;
+ case PT_ENTRY_TABLE:
+ return pt_descend(&pts, arg, __set_dirty);
+ case PT_ENTRY_OA:
+ if (!pt_entry_make_write_dirty(&pts))
+ return -EAGAIN;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table,
+ dma_addr_t iova)
+{
+ struct pt_range range;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Note: There is no locking here yet; if the test suite races this it
+ * can crash. It should use RCU locking eventually.
+ */
+ return pt_walk_range(&range, __set_dirty, NULL);
+}
+
+struct pt_iommu_collect_args {
+ struct iommu_pages_list free_list;
+ /* Fail if any OAs are within the range */
+ u8 check_mapped : 1;
+};
+
+static int __collect_tables(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_collect_args *collect = arg;
+ int ret;
+
+ if (!collect->check_mapped && !pt_can_have_table(&pts))
+ return 0;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ iommu_pages_list_add(&collect->free_list, pts.table_lower);
+ ret = pt_descend(&pts, arg, __collect_tables);
+ if (ret)
+ return ret;
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA && collect->check_mapped)
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+enum alloc_mode { ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH };
+
+/* Allocate a table, the empty table will be ready to be installed. */
+static inline struct pt_table_p *_table_alloc(struct pt_common *common,
+ size_t lg2sz, gfp_t gfp,
+ enum alloc_mode mode)
+{
+ struct pt_iommu *iommu_table = iommu_from_common(common);
+ struct pt_table_p *table_mem;
+
+ table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
+ log2_to_int(lg2sz));
+ if (!table_mem)
+ return ERR_PTR(-ENOMEM);
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ mode == ALLOC_NORMAL) {
+ int ret = iommu_pages_start_incoherent(
+ table_mem, iommu_table->iommu_device);
+ if (ret) {
+ iommu_free_pages(table_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ return table_mem;
+}
+
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+ uintptr_t top_of_table,
+ gfp_t gfp,
+ enum alloc_mode mode)
+{
+ /*
+ * The top doesn't need the free list or the other iommu-pages machinery,
+ * so it technically doesn't have to use the iommu pages API. Use it
+ * anyhow to keep things simple, as the top is usually not smaller than
+ * PAGE_SIZE.
+ */
+ return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
+ gfp, mode);
+}
+
+/* Allocate an interior table */
+static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
+ gfp_t gfp, enum alloc_mode mode)
+{
+ struct pt_state child_pts =
+ pt_init(parent_pts->range, parent_pts->level - 1, NULL);
+
+ return _table_alloc(parent_pts->range->common,
+ pt_num_items_lg2(&child_pts) +
+ ilog2(PT_ITEM_WORD_SIZE),
+ gfp, mode);
+}
+
+static inline int pt_iommu_new_table(struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ struct pt_table_p *table_mem;
+ phys_addr_t phys;
+
+ /* Given PA/VA/length can't be represented */
+ if (PT_WARN_ON(!pt_can_have_table(pts)))
+ return -ENXIO;
+
+ table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
+ if (IS_ERR(table_mem))
+ return PTR_ERR(table_mem);
+
+ phys = virt_to_phys(table_mem);
+ if (!pt_install_table(pts, phys, attrs)) {
+ iommu_pages_free_incoherent(
+ table_mem,
+ iommu_from_common(pts->range->common)->iommu_device);
+ return -EAGAIN;
+ }
+
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) {
+ flush_writes_item(pts);
+ pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+ /*
+ * The underlying table can't store the physical table address.
+ * This happens when kunit tests tables outside their normal
+ * environment, e.g. where the CPU's physical address width is limited.
+ */
+ pt_load_single_entry(pts);
+ if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
+ pt_clear_entries(pts, ilog2(1));
+ iommu_pages_free_incoherent(
+ table_mem, iommu_from_common(pts->range->common)
+ ->iommu_device);
+ return -EINVAL;
+ }
+ }
+
+ pts->table_lower = table_mem;
+ return 0;
+}
+
+struct pt_iommu_map_args {
+ struct iommu_iotlb_gather *iotlb_gather;
+ struct pt_write_attrs attrs;
+ pt_oaddr_t oa;
+ unsigned int leaf_pgsize_lg2;
+ unsigned int leaf_level;
+};
+
+/*
+ * This will recursively check any tables in the block to validate they are
+ * empty and then free them through the gather.
+ */
+static int clear_contig(const struct pt_state *start_pts,
+ struct iommu_iotlb_gather *iotlb_gather,
+ unsigned int step, unsigned int pgsize_lg2)
+{
+ struct pt_iommu *iommu_table =
+ iommu_from_common(start_pts->range->common);
+ struct pt_range range = *start_pts->range;
+ struct pt_state pts =
+ pt_init(&range, start_pts->level, start_pts->table);
+ struct pt_iommu_collect_args collect = { .check_mapped = true };
+ int ret;
+
+ pts.index = start_pts->index;
+ pts.end_index = start_pts->index + step;
+ for (; _pt_iter_load(&pts); pt_next_entry(&pts)) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ collect.free_list =
+ IOMMU_PAGES_LIST_INIT(collect.free_list);
+ ret = pt_walk_descend_all(&pts, __collect_tables,
+ &collect);
+ if (ret)
+ return ret;
+
+ /*
+ * The table item must be cleared before we can update
+ * the gather
+ */
+ pt_clear_entries(&pts, ilog2(1));
+ flush_writes_item(&pts);
+
+ iommu_pages_list_add(&collect.free_list,
+ pt_table_ptr(&pts));
+ gather_range_pages(
+ iotlb_gather, iommu_table, range.va,
+ log2_to_int(pt_table_item_lg2sz(&pts)),
+ &collect.free_list);
+ } else if (pts.type != PT_ENTRY_EMPTY) {
+ return -EADDRINUSE;
+ }
+ }
+ return 0;
+}
+
+static int __map_range_leaf(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
+ unsigned int start_index;
+ pt_oaddr_t oa = map->oa;
+ unsigned int step;
+ bool need_contig;
+ int ret = 0;
+
+ PT_WARN_ON(map->leaf_level != level);
+ PT_WARN_ON(!pt_can_have_leaf(&pts));
+
+ step = log2_to_int_t(unsigned int,
+ leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts));
+ need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts);
+
+ _pt_iter_first(&pts);
+ start_index = pts.index;
+ do {
+ pts.type = pt_load_entry_raw(&pts);
+ if (pts.type != PT_ENTRY_EMPTY || need_contig) {
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ ret = clear_contig(&pts, map->iotlb_gather, step,
+ leaf_pgsize_lg2);
+ if (ret)
+ break;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+ pt_index_to_va(&pts);
+ PT_WARN_ON(compute_best_pgsize(&pts, oa) !=
+ leaf_pgsize_lg2);
+ }
+ pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs);
+
+ oa += log2_to_int(leaf_pgsize_lg2);
+ pts.index += step;
+ } while (pts.index < pts.end_index);
+
+ flush_writes_range(&pts, start_index, pts.index);
+
+ map->oa = oa;
+ return ret;
+}
+
+static int __map_range(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ int ret;
+
+ PT_WARN_ON(map->leaf_level == level);
+ PT_WARN_ON(!pt_can_have_table(&pts));
+
+ _pt_iter_first(&pts);
+
+ /* Descend to a child table */
+ do {
+ pts.type = pt_load_entry_raw(&pts);
+
+ if (pts.type != PT_ENTRY_TABLE) {
+ if (pts.type != PT_ENTRY_EMPTY)
+ return -EADDRINUSE;
+ ret = pt_iommu_new_table(&pts, &map->attrs);
+ if (ret) {
+ /*
+ * Racing with another thread installing a table
+ */
+ if (ret == -EAGAIN)
+ continue;
+ return ret;
+ }
+ } else {
+ pts.table_lower = pt_table_ptr(&pts);
+ /*
+ * Racing with a shared pt_iommu_new_table()? The other
+ * thread is still flushing the cache, so we have to
+ * also flush it to ensure that when our thread's map
+ * completes all the table items leading to our mapping
+ * are visible.
+ *
+ * This requires pt_set_sw_bit_release() to be a release
+ * ordered after the cache flush so that the acquire here
+ * guarantees the flush is visible to the IOMMU.
+ */
+ if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) &&
+ !pt_test_sw_bit_acquire(&pts,
+ SW_BIT_CACHE_FLUSH_DONE))
+ flush_writes_item(&pts);
+ }
+
+ /*
+ * The already present table can possibly be shared with another
+ * concurrent map.
+ */
+ if (map->leaf_level == level - 1)
+ ret = pt_descend(&pts, arg, __map_range_leaf);
+ else
+ ret = pt_descend(&pts, arg, __map_range);
+ if (ret)
+ return ret;
+
+ pts.index++;
+ pt_index_to_va(&pts);
+ if (pts.index >= pts.end_index)
+ break;
+ } while (true);
+ return 0;
+}
+
+/*
+ * Fast path for the easy case of mapping a 4k page to an already allocated
+ * table. This is a common workload. If it returns EAGAIN run the full algorithm
+ * instead.
+ */
+static __always_inline int __do_map_single_page(struct pt_range *range,
+ void *arg, unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+
+ pts.type = pt_load_single_entry(&pts);
+ if (level == 0) {
+ if (pts.type != PT_ENTRY_EMPTY)
+ return -EADDRINUSE;
+ pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
+ &map->attrs);
+ /* No flush needed; this fast path is not used when incoherent */
+ map->oa += PAGE_SIZE;
+ return 0;
+ }
+ if (pts.type == PT_ENTRY_TABLE)
+ return pt_descend(&pts, arg, descend_fn);
+ /* Something else, use the slow path */
+ return -EAGAIN;
+}
+PT_MAKE_LEVELS(__map_single_page, __do_map_single_page);
+
+/*
+ * Add a table to the top, increasing the top level as much as necessary to
+ * encompass range.
+ */
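+/*
+ * Illustrative example: if a 3-level table covers IOVA below 2^39 and the
+ * caller maps at 2^40, one level is added; the new top's item 0 points at the
+ * old top and driver_ops->change_top() switches the HW before the new
+ * top_of_table value is published.
+ */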
+static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
+ struct pt_iommu_map_args *map)
+{
+ struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
+ struct pt_common *common = common_from_iommu(iommu_table);
+ uintptr_t top_of_table = READ_ONCE(common->top_of_table);
+ uintptr_t new_top_of_table = top_of_table;
+ struct pt_table_p *table_mem;
+ unsigned int new_level;
+ spinlock_t *domain_lock;
+ unsigned long flags;
+ int ret;
+
+ while (true) {
+ struct pt_range top_range =
+ _pt_top_range(common, new_top_of_table);
+ struct pt_state pts = pt_init_top(&top_range);
+
+ top_range.va = range->va;
+ top_range.last_va = range->last_va;
+
+ if (!pt_check_range(&top_range) &&
+ map->leaf_level <= pts.level) {
+ new_level = pts.level;
+ break;
+ }
+
+ pts.level++;
+ if (pts.level > PT_MAX_TOP_LEVEL ||
+ pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
+ ret = -ERANGE;
+ goto err_free;
+ }
+
+ table_mem =
+ table_alloc_top(common, _pt_top_set(NULL, pts.level),
+ map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH);
+ if (IS_ERR(table_mem)) {
+ ret = PTR_ERR(table_mem);
+ goto err_free;
+ }
+ iommu_pages_list_add(&free_list, table_mem);
+
+ /* The new table links to the lower table always at index 0 */
+ top_range.va = 0;
+ top_range.top_level = pts.level;
+ pts.table_lower = pts.table;
+ pts.table = table_mem;
+ pt_load_single_entry(&pts);
+ PT_WARN_ON(pts.index != 0);
+ pt_install_table(&pts, virt_to_phys(pts.table_lower),
+ &map->attrs);
+ new_top_of_table = _pt_top_set(pts.table, pts.level);
+ }
+
+ /*
+ * Avoid double flushing, flush it once after all pt_install_table()
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ ret = iommu_pages_start_incoherent_list(
+ &free_list, iommu_table->iommu_device);
+ if (ret)
+ goto err_free;
+ }
+
+ /*
+ * top_of_table is write locked by the spinlock, but readers can use
+ * READ_ONCE() to get the value. Since we encode both the level and the
+ * pointer in one quantum the lockless reader will always see something
+ * valid. The HW must be updated to the new level under the spinlock
+ * before top_of_table is updated so that concurrent readers don't map
+ * into the new level until it is fully functional. If another thread
+ * already updated it while we were working then throw everything away
+ * and try again.
+ */
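+ /*
+ * Reader side sketch (for illustration): a lockless walker does a
+ * single READ_ONCE(common->top_of_table) and derives both the level
+ * and the table pointer from that one value, so it can never observe
+ * a mismatched level/pointer pair.
+ */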
+ domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table);
+ spin_lock_irqsave(domain_lock, flags);
+ if (common->top_of_table != top_of_table ||
+ top_of_table == new_top_of_table) {
+ spin_unlock_irqrestore(domain_lock, flags);
+ ret = -EAGAIN;
+ goto err_free;
+ }
+
+ /*
+ * We do not issue any flushes for change_top on the expectation that
+ * any walk cache will not become a problem by adding another layer to
+ * the tree. Misses will rewalk from the updated top pointer, hits
+ * continue to be correct. Negative caching is fine too since all the
+ * new IOVA added by the new top is non-present.
+ */
+ iommu_table->driver_ops->change_top(
+ iommu_table, virt_to_phys(table_mem), new_level);
+ WRITE_ONCE(common->top_of_table, new_top_of_table);
+ spin_unlock_irqrestore(domain_lock, flags);
+ return 0;
+
+err_free:
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&free_list,
+ iommu_table->iommu_device);
+ iommu_put_pages_list(&free_list);
+ return ret;
+}
+
+static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range,
+ struct pt_iommu_map_args *map)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ int ret;
+
+ do {
+ ret = pt_check_range(range);
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ return ret;
+
+ if (!ret && map->leaf_level <= range->top_level)
+ break;
+
+ ret = increase_top(iommu_table, range, map);
+ if (ret && ret != -EAGAIN)
+ return ret;
+
+ /* Reload the new top */
+ *range = pt_make_range(common, range->va, range->last_va);
+ } while (ret);
+ PT_WARN_ON(pt_check_range(range));
+ return 0;
+}
+
+static int do_map(struct pt_range *range, struct pt_common *common,
+ bool single_page, struct pt_iommu_map_args *map)
+{
+ /*
+ * The __map_single_page() fast path does not support DMA_INCOHERENT
+ * flushing to keep its .text small.
+ */
+ if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ int ret;
+
+ ret = pt_walk_range(range, __map_single_page, map);
+ if (ret != -EAGAIN)
+ return ret;
+ /* EAGAIN falls through to the full path */
+ }
+
+ if (map->leaf_level == range->top_level)
+ return pt_walk_range(range, __map_range_leaf, map);
+ return pt_walk_range(range, __map_range, map);
+}
+
+/**
+ * map_pages() - Install translation for an IOVA range
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @paddr: Physical/Output address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
+ * @gfp: GFP flags for any memory allocations
+ * @mapped: Total bytes successfully mapped
+ *
+ * The range starting at IOVA will have paddr installed into it. The caller
+ * must specify a valid pgsize and pgcount to segment the range into compatible
+ * blocks.
+ *
+ * On error the caller will probably want to invoke unmap on the range from iova
+ * up to the amount indicated by @mapped to return the table back to an
+ * unchanged state.
+ *
+ * Context: The caller must hold a write range lock that includes the whole
+ * range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were
+ * mapped is added to @mapped; @mapped is not zeroed first.
+ */
+int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct iommu_iotlb_gather iotlb_gather;
+ pt_vaddr_t len = pgsize * pgcount;
+ struct pt_iommu_map_args map = {
+ .iotlb_gather = &iotlb_gather,
+ .oa = paddr,
+ .leaf_pgsize_lg2 = vaffs(pgsize),
+ };
+ bool single_page = false;
+ struct pt_range range;
+ int ret;
+
+ iommu_iotlb_gather_init(&iotlb_gather);
+
+ if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
+ return -EINVAL;
+
+ /* Check the paddr doesn't exceed what the table can store */
+ if ((sizeof(pt_oaddr_t) < sizeof(paddr) &&
+ (pt_vaddr_t)paddr > PT_VADDR_MAX) ||
+ (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
+ oalog2_div(paddr, common->max_oasz_lg2)))
+ return -ERANGE;
+
+ ret = pt_iommu_set_prot(common, &map.attrs, prot);
+ if (ret)
+ return ret;
+ map.attrs.gfp = gfp;
+
+ ret = make_range_no_check(common, &range, iova, len);
+ if (ret)
+ return ret;
+
+ /* Calculate target page size and level for the leaves */
+ if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
+ pgcount == 1) {
+ PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
+ if (log2_mod(iova | paddr, PAGE_SHIFT))
+ return -ENXIO;
+ map.leaf_pgsize_lg2 = PAGE_SHIFT;
+ map.leaf_level = 0;
+ single_page = true;
+ } else {
+ map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
+ pgsize_bitmap, range.va, range.last_va, paddr);
+ if (!map.leaf_pgsize_lg2)
+ return -ENXIO;
+ map.leaf_level =
+ pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
+ }
+
+ ret = check_map_range(iommu_table, &range, &map);
+ if (ret)
+ return ret;
+
+ PT_WARN_ON(map.leaf_level > range.top_level);
+
+ ret = do_map(&range, common, single_page, &map);
+
+ /*
+ * If table levels were freed and replaced with large items, flush any walk
+ * cache that may refer to the freed levels.
+ */
+ if (!iommu_pages_list_empty(&iotlb_gather.freelist))
+ iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);
+
+ /* Bytes successfully mapped */
+ PT_WARN_ON(!ret && map.oa - paddr != len);
+ *mapped += map.oa - paddr;
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");
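+/*
+ * Minimal usage sketch (hypothetical caller, assuming the amdv1 instantiation):
+ *
+ *	size_t mapped = 0;
+ *	int rc = pt_iommu_amdv1_map_pages(domain, iova, paddr, SZ_4K, 16,
+ *					  IOMMU_READ | IOMMU_WRITE, GFP_KERNEL,
+ *					  &mapped);
+ *
+ * On failure the caller would typically unmap the partially mapped bytes
+ * indicated by mapped.
+ */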
+
+struct pt_unmap_args {
+ struct iommu_pages_list free_list;
+ pt_vaddr_t unmapped;
+};
+
+static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_unmap_args *unmap = arg;
+ unsigned int num_oas = 0;
+ unsigned int start_index;
+ int ret = 0;
+
+ _pt_iter_first(&pts);
+ start_index = pts.index;
+ pts.type = pt_load_entry_raw(&pts);
+ /*
+ * A starting index is in the middle of a contiguous entry
+ *
+ * The IOMMU API does not require drivers to support unmapping parts of
+ * large pages. Long ago VFIO would try to split maps but the current
+ * version never does.
+ *
+ * Instead when unmap reaches a partial unmap of the start of a large
+ * IOPTE it should remove the entire IOPTE and return that size to the
+ * caller.
+ */
+ if (pts.type == PT_ENTRY_OA) {
+ if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
+ return -EINVAL;
+ /* Micro optimization */
+ goto start_oa;
+ }
+
+ do {
+ if (pts.type != PT_ENTRY_OA) {
+ bool fully_covered;
+
+ if (pts.type != PT_ENTRY_TABLE) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ pts.table_lower = pt_table_ptr(&pts);
+
+ fully_covered = pt_entry_fully_covered(
+ &pts, pt_table_item_lg2sz(&pts));
+
+ ret = pt_descend(&pts, arg, __unmap_range);
+ if (ret)
+ break;
+
+ /*
+ * If the unmapping range fully covers the table then we
+ * can free it as well. The clear is delayed until we
+ * succeed in clearing the lower table levels.
+ */
+ if (fully_covered) {
+ iommu_pages_list_add(&unmap->free_list,
+ pts.table_lower);
+ pt_clear_entries(&pts, ilog2(1));
+ }
+ pts.index++;
+ } else {
+ unsigned int num_contig_lg2;
+start_oa:
+ /*
+ * If the caller requested a last address that falls within a
+ * single entry then the entire entry is unmapped and
+ * the length returned will be larger than requested.
+ */
+ num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
+ pt_clear_entries(&pts, num_contig_lg2);
+ num_oas += log2_to_int(num_contig_lg2);
+ pts.index += log2_to_int(num_contig_lg2);
+ }
+ if (pts.index >= pts.end_index)
+ break;
+ pts.type = pt_load_entry_raw(&pts);
+ } while (true);
+
+ unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
+ flush_writes_range(&pts, start_index, pts.index);
+
+ return ret;
+}
+
+/**
+ * unmap_pages() - Make a range of IOVA empty/not present
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @iotlb_gather: Gather struct that must be flushed on return
+ *
+ * unmap_pages() will remove a translation created by map_pages(). It cannot
+ * subdivide a mapping created by map_pages(), so it should be called with IOVA
+ * ranges that match those passed to map_pages(). The IOVA range can aggregate
+ * contiguous map_pages() calls so long as no individual range is split.
+ *
+ * Context: The caller must hold a write range lock that includes
+ * the whole range.
+ *
+ * Returns: Number of bytes of VA unmapped. iova + res will be the point at
+ * which unmapping stopped.
+ */
+size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
+ unmap.free_list) };
+ pt_vaddr_t len = pgsize * pgcount;
+ struct pt_range range;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
+ if (ret)
+ return 0;
+
+ pt_walk_range(&range, __unmap_range, &unmap);
+
+ gather_range_pages(iotlb_gather, iommu_table, iova, len,
+ &unmap.free_list);
+
+ return unmap.unmapped;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU");
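+/*
+ * Usage sketch (hypothetical caller, assuming the amdv1 instantiation); the
+ * gather must be flushed after unmapping:
+ *
+ *	struct iommu_iotlb_gather gather;
+ *
+ *	iommu_iotlb_gather_init(&gather);
+ *	unmapped = pt_iommu_amdv1_unmap_pages(domain, iova, SZ_2M, 1, &gather);
+ *	iommu_iotlb_sync(domain, &gather);
+ */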
+
+static void NS(get_info)(struct pt_iommu *iommu_table,
+ struct pt_iommu_info *info)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_top_range(common);
+ struct pt_state pts = pt_init_top(&range);
+ pt_vaddr_t pgsize_bitmap = 0;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
+ for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
+ pts.level++) {
+ if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
+ break;
+ pgsize_bitmap |= pt_possible_sizes(&pts);
+ }
+ } else {
+ for (pts.level = 0; pts.level <= range.top_level; pts.level++)
+ pgsize_bitmap |= pt_possible_sizes(&pts);
+ }
+
+ /* Hide page sizes larger than the maximum OA */
+ info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
+}
+
+static void NS(deinit)(struct pt_iommu *iommu_table)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_all_range(common);
+ struct pt_iommu_collect_args collect = {
+ .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+ };
+
+ iommu_pages_list_add(&collect.free_list, range.top_table);
+ pt_walk_range(&range, __collect_tables, &collect);
+
+ /*
+ * The driver has to already have fenced the HW access to the page table
+ * and invalidated any caches referring to this memory.
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&collect.free_list,
+ iommu_table->iommu_device);
+ iommu_put_pages_list(&collect.free_list);
+}
+
+static const struct pt_iommu_ops NS(ops) = {
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \
+ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
+ .set_dirty = NS(set_dirty),
+#endif
+ .get_info = NS(get_info),
+ .deinit = NS(deinit),
+};
+
+static int pt_init_common(struct pt_common *common)
+{
+ struct pt_range top_range = pt_top_range(common);
+
+ if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
+ return -EINVAL;
+
+ if (top_range.top_level == PT_MAX_TOP_LEVEL ||
+ common->max_vasz_lg2 == top_range.max_vasz_lg2)
+ common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);
+
+ if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
+ common->features |= BIT(PT_FEAT_FULL_VA);
+
+ /* Requested features must match features compiled into this format */
+ if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
+ (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
+ (common->features & PT_FORCE_ENABLED_FEATURES) !=
+ PT_FORCE_ENABLED_FEATURES))
+ return -EOPNOTSUPP;
+
+ /*
+ * Check if the top level of the page table is too small to hold the
+ * specified max_vasz_lg2.
+ */
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ top_range.top_level != PT_MAX_TOP_LEVEL) {
+ struct pt_state pts = { .range = &top_range,
+ .level = top_range.top_level };
+
+ if (common->max_vasz_lg2 >
+ pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts))
+ return -EOPNOTSUPP;
+ }
+
+ if (common->max_oasz_lg2 == 0)
+ common->max_oasz_lg2 = pt_max_oa_lg2(common);
+ else
+ common->max_oasz_lg2 = min(common->max_oasz_lg2,
+ pt_max_oa_lg2(common));
+ return 0;
+}
+
+static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
+ struct iommu_domain *domain)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_iommu_info info;
+ struct pt_range range;
+
+ NS(get_info)(iommu_table, &info);
+
+ domain->type = __IOMMU_DOMAIN_PAGING;
+ domain->pgsize_bitmap = info.pgsize_bitmap;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ range = _pt_top_range(common,
+ _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
+ else
+ range = pt_top_range(common);
+
+ /* A 64-bit high address space table on a 32-bit system cannot work. */
+ domain->geometry.aperture_start = (unsigned long)range.va;
+ if ((pt_vaddr_t)domain->geometry.aperture_start != range.va)
+ return -EOVERFLOW;
+
+ /*
+ * The aperture is limited to what the API can do after considering all
+ * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
+ * to store a VA. Set the aperture to something that is valid for all
+ * cases. Saturate instead of truncate the end if the types are smaller
+ * than the top range. aperture_end should be called aperture_last.
+ */
+ domain->geometry.aperture_end = (unsigned long)range.last_va;
+ if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
+ domain->geometry.aperture_end = ULONG_MAX;
+ domain->pgsize_bitmap &= ULONG_MAX;
+ }
+ domain->geometry.force_aperture = true;
+
+ return 0;
+}
+
+static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_iommu cfg = *iommu_table;
+
+ static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
+ memset_after(fmt_table, 0, iommu.domain);
+
+ /* The caller can initialize some of these values */
+ iommu_table->iommu_device = cfg.iommu_device;
+ iommu_table->driver_ops = cfg.driver_ops;
+ iommu_table->nid = cfg.nid;
+}
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
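+/*
+ * For example, with PTPFX == amdv1_ this defines pt_iommu_amdv1_init(), taking
+ * a per-format struct pt_iommu_amdv1_cfg (illustrative names).
+ */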
+
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+ const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_table_p *table_mem;
+ int ret;
+
+ if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
+ !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
+ return -EINVAL;
+
+ pt_iommu_zero(fmt_table);
+ common->features = cfg->common.features;
+ common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+ common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
+ ret = pt_iommu_fmt_init(fmt_table, cfg);
+ if (ret)
+ return ret;
+
+ if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common))
+ return -EINVAL;
+
+ ret = pt_init_common(common);
+ if (ret)
+ return ret;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ WARN_ON(!iommu_table->driver_ops ||
+ !iommu_table->driver_ops->change_top ||
+ !iommu_table->driver_ops->get_top_lock))
+ return -EINVAL;
+
+ if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
+ (pt_feature(common, PT_FEAT_FULL_VA) ||
+ pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
+ return -EINVAL;
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ WARN_ON(!iommu_table->iommu_device))
+ return -EINVAL;
+
+ ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
+ if (ret)
+ return ret;
+
+ table_mem = table_alloc_top(common, common->top_of_table, gfp,
+ ALLOC_NORMAL);
+ if (IS_ERR(table_mem))
+ return PTR_ERR(table_mem);
+ pt_top_set(common, table_mem, pt_top_get_level(common));
+
+ /* Must be last, see pt_iommu_deinit() */
+ iommu_table->ops = &NS(ops);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");
+
+#ifdef pt_iommu_fmt_hw_info
+#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
+#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
+void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
+ struct pt_iommu_table_hw_info *info)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range top_range = pt_top_range(common);
+
+ pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
+MODULE_IMPORT_NS("GENERIC_PT");
+/* For iommu_dirty_bitmap_record() */
+MODULE_IMPORT_NS("IOMMUFD");
+
+#endif /* __GENERIC_PT_IOMMU_PT_H */
diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h
new file mode 100644
index 000000000000..68278bf15cfe
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_generic_pt.h
@@ -0,0 +1,823 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Test the format API directly.
+ *
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ int ret;
+
+ KUNIT_ASSERT_EQ(test, len, (size_t)len);
+
+ ret = iommu_map(&priv->domain, va, pa, len, IOMMU_READ | IOMMU_WRITE,
+ GFP_KERNEL);
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "map_pages", ret);
+}
+
+#define KUNIT_ASSERT_PT_LOAD(test, pts, entry) \
+ ({ \
+ pt_load_entry(pts); \
+ KUNIT_ASSERT_EQ(test, (pts)->type, entry); \
+ })
+
+struct check_levels_arg {
+ struct kunit *test;
+ void *fn_arg;
+ void (*fn)(struct kunit *test, struct pt_state *pts, void *arg);
+};
+
+static int __check_all_levels(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct check_levels_arg *chk = arg;
+ struct kunit *test = chk->test;
+ int ret;
+
+ _pt_iter_first(&pts);
+
+ /*
+ * If we were able to use the full VA space this should always be the
+ * last index in each table.
+ */
+ if (!(IS_32BIT && range->max_vasz_lg2 > 32)) {
+ if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND) &&
+ pts.level == pts.range->top_level)
+ KUNIT_ASSERT_EQ(test, pts.index,
+ log2_to_int(range->max_vasz_lg2 - 1 -
+ pt_table_item_lg2sz(&pts)) -
+ 1);
+ else
+ KUNIT_ASSERT_EQ(test, pts.index,
+ log2_to_int(pt_table_oa_lg2sz(&pts) -
+ pt_table_item_lg2sz(&pts)) -
+ 1);
+ }
+
+ if (pt_can_have_table(&pts)) {
+ pt_load_single_entry(&pts);
+ KUNIT_ASSERT_EQ(test, pts.type, PT_ENTRY_TABLE);
+ ret = pt_descend(&pts, arg, __check_all_levels);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* Index 0 is used by the test */
+ if (IS_32BIT && !pts.index)
+ return 0;
+ KUNIT_ASSERT_NE(chk->test, pts.index, 0);
+ }
+
+ /*
+ * A format should not create a table with only one entry; at the least,
+ * this test approach won't work if it does.
+ */
+ KUNIT_ASSERT_GT(chk->test, pts.end_index, 1);
+
+ /*
+ * For increase_top the original top's tree ends up using index 0, so
+ * test at index 1 instead when possible.
+ */
+ pts.index = 0;
+ pt_index_to_va(&pts);
+ pt_load_single_entry(&pts);
+ if (pts.type == PT_ENTRY_TABLE && pts.end_index > 2) {
+ pts.index = 1;
+ pt_index_to_va(&pts);
+ }
+ (*chk->fn)(chk->test, &pts, chk->fn_arg);
+ return 0;
+}
+
+/*
+ * Call fn for each level in the table with a pts set up at index 0 in a table
+ * for that level. This allows writing tests that run on every level.
+ * The test can use every index in the table except the last one.
+ */
+static void check_all_levels(struct kunit *test,
+ void (*fn)(struct kunit *test,
+ struct pt_state *pts, void *arg),
+ void *fn_arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct check_levels_arg chk = {
+ .test = test,
+ .fn = fn,
+ .fn_arg = fn_arg,
+ };
+ int ret;
+
+ if (pt_feature(priv->common, PT_FEAT_DYNAMIC_TOP) &&
+ priv->common->max_vasz_lg2 > range.max_vasz_lg2)
+ range.last_va = fvalog2_set_mod_max(range.va,
+ priv->common->max_vasz_lg2);
+
+ /*
+ * Map a page at the highest VA; this will populate all the levels so we
+ * can then iterate over them. Index 0 will be used for testing.
+ */
+ if (IS_32BIT && range.max_vasz_lg2 > 32)
+ range.last_va = (u32)range.last_va;
+ range.va = range.last_va - (priv->smallest_pgsz - 1);
+ do_map(test, range.va, 0, priv->smallest_pgsz);
+
+ range = pt_make_range(priv->common, range.va, range.last_va);
+ ret = pt_walk_range(&range, __check_all_levels, &chk);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+}
+
+static void test_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ /* Fixture does the setup */
+ KUNIT_ASSERT_NE(test, priv->info.pgsize_bitmap, 0);
+}
+
+/*
+ * Basic check that the log2_* functions are working, especially at the integer
+ * limits.
+ */
+static void test_bitops(struct kunit *test)
+{
+ int i;
+
+ KUNIT_ASSERT_EQ(test, fls_t(u32, 0), 0);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, 1), 1);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, BIT(2)), 3);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, U32_MAX), 32);
+
+ KUNIT_ASSERT_EQ(test, fls_t(u64, 0), 0);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, 1), 1);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, BIT(2)), 3);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, U64_MAX), 64);
+
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, 1), 0);
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(2)), 2);
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(31)), 31);
+
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, 1), 0);
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT(2)), 2);
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT_ULL(63)), 63);
+
+ for (i = 0; i != 31; i++)
+ KUNIT_ASSERT_EQ(test, ffz_t(u32, BIT(i) - 1), i);
+
+ for (i = 0; i != 63; i++)
+ KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i);
+
+ for (i = 0; i != 32; i++) {
+ u64 val = get_random_u64();
+
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffs_t(u32, val)), 0);
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffs_t(u64, val)), 0);
+
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffz_t(u32, val)),
+ log2_to_max_int_t(u32, ffz_t(u32, val)));
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffz_t(u64, val)),
+ log2_to_max_int_t(u64, ffz_t(u64, val)));
+ }
+}
+
+static unsigned int ref_best_pgsize(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va,
+ pt_vaddr_t last_va, pt_oaddr_t oa)
+{
+ pt_vaddr_t pgsz_lg2;
+
+ /* Brute force the constraints described in pt_compute_best_pgsize() */
+ for (pgsz_lg2 = PT_VADDR_MAX_LG2 - 1; pgsz_lg2 != 0; pgsz_lg2--) {
+ if ((pgsz_bitmap & log2_to_int(pgsz_lg2)) &&
+ log2_mod(va, pgsz_lg2) == 0 &&
+ oalog2_mod(oa, pgsz_lg2) == 0 &&
+ va + log2_to_int(pgsz_lg2) - 1 <= last_va &&
+ log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2) &&
+ oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2))
+ return pgsz_lg2;
+ }
+ return 0;
+}
+
+/* Check that the bit logic in pt_compute_best_pgsize() works. */
+static void test_best_pgsize(struct kunit *test)
+{
+ unsigned int a_lg2;
+ unsigned int b_lg2;
+ unsigned int c_lg2;
+
+ /* Try random prefixes with every suffix combination */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+
+ /* 0 prefix, every suffix */
+ for (c_lg2 = 1; c_lg2 != PT_VADDR_MAX_LG2 - 1; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = 0;
+ pt_oaddr_t oa = 0;
+ pt_vaddr_t last_va = log2_set_mod_max(0, c_lg2);
+
+ KUNIT_ASSERT_EQ(test,
+ pt_compute_best_pgsize(pgsz_bitmap, va, last_va,
+ oa),
+ ref_best_pgsize(pgsz_bitmap, va, last_va, oa));
+ }
+
+ /* 1's prefix, every suffix */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = PT_VADDR_MAX << a_lg2;
+ pt_oaddr_t oa = PT_VADDR_MAX << b_lg2;
+ pt_vaddr_t last_va = PT_VADDR_MAX;
+
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+
+ /* pgsize_bitmap is always 0 */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = 0;
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ 0);
+ }
+ }
+ }
+
+ if (sizeof(pt_vaddr_t) <= 4)
+ return;
+
+ /* over 32 bit page sizes */
+ for (a_lg2 = 32; a_lg2 != 42; a_lg2++) {
+ for (b_lg2 = 32; b_lg2 != 42; b_lg2++) {
+ for (c_lg2 = 32; c_lg2 != 42; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+}
+
+/*
+ * Check that pt_install_table() and pt_table_pa() match
+ */
+static void test_lvl_table_ptr(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ struct pt_write_attrs attrs = {};
+
+ if (!pt_can_have_table(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ pt_load_single_entry(pts);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ /* A second install should pass because install updates pts->entry. */
+ KUNIT_ASSERT_EQ(test, pt_install_table(pts, paddr, &attrs), true);
+
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_TABLE);
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entries(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static void test_table_ptr(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_table_ptr, NULL);
+}
+
+struct lvl_radix_arg {
+ pt_vaddr_t vbits;
+};
+
+/*
+ * Check pt_table_oa_lg2sz() and pt_table_item_lg2sz(); they need to decode a
+ * contiguous span of VA bits across all the levels that covers the entire advertised
+ * VA space.
+ */
+static void test_lvl_radix(struct kunit *test, struct pt_state *pts, void *arg)
+{
+ unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct lvl_radix_arg *radix = arg;
+
+ /* Every bit below us is decoded */
+ KUNIT_ASSERT_EQ(test, log2_set_mod_max(0, isz_lg2), radix->vbits);
+
+ /* We are not decoding bits someone else is */
+ KUNIT_ASSERT_EQ(test, log2_div(radix->vbits, isz_lg2), 0);
+
+ /* Can't decode past the pt_vaddr_t size */
+ KUNIT_ASSERT_LE(test, table_lg2sz, PT_VADDR_MAX_LG2);
+ KUNIT_ASSERT_EQ(test, fvalog2_div(table_lg2sz, PT_MAX_VA_ADDRESS_LG2),
+ 0);
+
+ radix->vbits = fvalog2_set_mod_max(0, table_lg2sz);
+}
+
+static void test_max_va(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+
+ KUNIT_ASSERT_GE(test, priv->common->max_vasz_lg2, range.max_vasz_lg2);
+}
+
+static void test_table_radix(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct lvl_radix_arg radix = { .vbits = priv->smallest_pgsz - 1 };
+ struct pt_range range;
+
+ check_all_levels(test, test_lvl_radix, &radix);
+
+ range = pt_top_range(priv->common);
+ if (range.max_vasz_lg2 == PT_VADDR_MAX_LG2) {
+ KUNIT_ASSERT_EQ(test, radix.vbits, PT_VADDR_MAX);
+ } else {
+ if (!IS_32BIT)
+ KUNIT_ASSERT_EQ(test,
+ log2_set_mod_max(0, range.max_vasz_lg2),
+ radix.vbits);
+ KUNIT_ASSERT_EQ(test, log2_div(radix.vbits, range.max_vasz_lg2),
+ 0);
+ }
+}
+
+static unsigned int safe_pt_num_items_lg2(const struct pt_state *pts)
+{
+ struct pt_range top_range = pt_top_range(pts->range->common);
+ struct pt_state top_pts = pt_init_top(&top_range);
+
+ /*
+ * Avoid calling pt_num_items_lg2() on the top; instead we can derive
+ * the size of the top table from the top range.
+ */
+ if (pts->level == top_range.top_level)
+ return ilog2(pt_range_to_end_index(&top_pts));
+ return pt_num_items_lg2(pts);
+}
+
+static void test_lvl_possible_sizes(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ unsigned int num_items_lg2 = safe_pt_num_items_lg2(pts);
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!pt_can_have_leaf(pts)) {
+ KUNIT_ASSERT_EQ(test, pgsize_bitmap, 0);
+ return;
+ }
+
+ /* No bits for sizes that would be outside this table */
+ KUNIT_ASSERT_EQ(test, log2_mod(pgsize_bitmap, isz_lg2), 0);
+ KUNIT_ASSERT_EQ(
+ test, fvalog2_div(pgsize_bitmap, num_items_lg2 + isz_lg2), 0);
+
+ /*
+ * Non-contiguous (single item) page sizes must be supported. AMDv1 has a
+ * HW bug where it does not support them on one of the levels.
+ */
+ if ((u64)pgsize_bitmap != 0xff0000000000ULL ||
+ strcmp(__stringify(PTPFX_RAW), "amdv1") != 0)
+ KUNIT_ASSERT_TRUE(test, pgsize_bitmap & log2_to_int(isz_lg2));
+ else
+ KUNIT_ASSERT_NE(test, pgsize_bitmap, 0);
+
+ /* A contiguous entry should not span the whole table */
+ if (num_items_lg2 + isz_lg2 != PT_VADDR_MAX_LG2)
+ KUNIT_ASSERT_FALSE(
+ test,
+ pgsize_bitmap & log2_to_int(num_items_lg2 + isz_lg2));
+}
+
+static void test_entry_possible_sizes(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_possible_sizes, NULL);
+}
+
+static void sweep_all_pgsizes(struct kunit *test, struct pt_state *pts,
+ struct pt_write_attrs *attrs,
+ pt_oaddr_t test_oaddr)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ unsigned int len_lg2;
+
+ if (pts->index != 0)
+ return;
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ struct pt_state sub_pts = *pts;
+ pt_oaddr_t oaddr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ oaddr = log2_set_mod(test_oaddr, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, attrs);
+ /* Verify that every contiguous item translates correctly */
+ for (sub_pts.index = 0;
+ sub_pts.index != log2_to_int(len_lg2 - isz_lg2);
+ sub_pts.index++) {
+ KUNIT_ASSERT_PT_LOAD(test, &sub_pts, PT_ENTRY_OA);
+ KUNIT_ASSERT_EQ(test, pt_item_oa(&sub_pts),
+ oaddr + sub_pts.index *
+ oalog2_mul(1, isz_lg2));
+ KUNIT_ASSERT_EQ(test, pt_entry_oa(&sub_pts), oaddr);
+ KUNIT_ASSERT_EQ(test, pt_entry_num_contig_lg2(&sub_pts),
+ len_lg2 - isz_lg2);
+ }
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+/*
+ * Check that pt_install_leaf_entry() and pt_entry_oa() match.
+ * Check that pt_clear_entries() works.
+ */
+static void test_lvl_entry_oa(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ unsigned int max_oa_lg2 = pts->range->common->max_oasz_lg2;
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ sweep_all_pgsizes(test, pts, &attrs, priv->test_oa);
+
+ /* Check that the table can store the boundary OAs */
+ sweep_all_pgsizes(test, pts, &attrs, 0);
+ if (max_oa_lg2 == PT_OADDR_MAX_LG2)
+ sweep_all_pgsizes(test, pts, &attrs, PT_OADDR_MAX);
+ else
+ sweep_all_pgsizes(test, pts, &attrs,
+ oalog2_to_max_int(max_oa_lg2));
+}
+
+static void test_entry_oa(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_entry_oa, NULL);
+}
+
+/* Test pt_attr_from_entry() */
+static void test_lvl_attr_from_entry(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct kunit_iommu_priv *priv = test->priv;
+ unsigned int len_lg2;
+ unsigned int prot;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+ for (prot = 0; prot <= (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE |
+ IOMMU_NOEXEC | IOMMU_MMIO);
+ prot++) {
+ pt_oaddr_t oaddr;
+ struct pt_write_attrs attrs = {};
+ u64 good_entry;
+
+ /*
+ * If the format doesn't support this combination of
+ * prot bits skip it
+ */
+ if (pt_iommu_set_prot(pts->range->common, &attrs,
+ prot)) {
+ /* But RW has to be supported */
+ KUNIT_ASSERT_NE(test, prot,
+ IOMMU_READ | IOMMU_WRITE);
+ continue;
+ }
+
+ oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ good_entry = pts->entry;
+
+ memset(&attrs, 0, sizeof(attrs));
+ pt_attr_from_entry(pts, &attrs);
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ /*
+ * The descriptor produced by pt_attr_from_entry()
+ * must produce an identical entry value when re-written.
+ */
+ KUNIT_ASSERT_EQ(test, good_entry, pts->entry);
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ }
+ }
+}
+
+static void test_attr_from_entry(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_attr_from_entry, NULL);
+}
+
+static void test_lvl_dirty(struct kunit *test, struct pt_state *pts, void *arg)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct kunit_iommu_priv *priv = test->priv;
+ unsigned int start_idx = pts->index;
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ | IOMMU_WRITE));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
+ pt_oaddr_t oaddr;
+ unsigned int i;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ pt_load_entry(pts);
+ pt_entry_make_write_clean(pts);
+ pt_load_entry(pts);
+ KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
+
+ for (i = 0; i != log2_to_int(len_lg2 - isz_lg2); i++) {
+ /* dirty every contiguous entry */
+ pts->index = start_idx + i;
+ pt_load_entry(pts);
+ KUNIT_ASSERT_TRUE(test, pt_entry_make_write_dirty(pts));
+ pts->index = start_idx;
+ pt_load_entry(pts);
+ KUNIT_ASSERT_TRUE(test, pt_entry_is_write_dirty(pts));
+
+ pt_entry_make_write_clean(pts);
+ pt_load_entry(pts);
+ KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
+ }
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ }
+}
+
+static __maybe_unused void test_dirty(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!pt_dirty_supported(priv->common))
+ kunit_skip(test,
+ "Page table features do not support dirty tracking");
+
+ check_all_levels(test, test_lvl_dirty, NULL);
+}
+
+static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ struct pt_write_attrs new_attrs = {};
+ unsigned int bitnr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ pt_install_leaf_entry(pts, paddr, len_lg2, &attrs);
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++) {
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr);
+
+ /* SW bits didn't leak into the attrs */
+ pt_attr_from_entry(pts, &new_attrs);
+ KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs));
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+static __maybe_unused void test_sw_bit_leaf(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_leaf, NULL);
+}
+
+static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ unsigned int bitnr;
+
+ if (!pt_can_have_table(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) {
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entries(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static __maybe_unused void test_sw_bit_table(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_table, NULL);
+}
+
+static struct kunit_case generic_pt_test_cases[] = {
+ KUNIT_CASE_FMT(test_init),
+ KUNIT_CASE_FMT(test_bitops),
+ KUNIT_CASE_FMT(test_best_pgsize),
+ KUNIT_CASE_FMT(test_table_ptr),
+ KUNIT_CASE_FMT(test_max_va),
+ KUNIT_CASE_FMT(test_table_radix),
+ KUNIT_CASE_FMT(test_entry_possible_sizes),
+ KUNIT_CASE_FMT(test_entry_oa),
+ KUNIT_CASE_FMT(test_attr_from_entry),
+#ifdef pt_entry_is_write_dirty
+ KUNIT_CASE_FMT(test_dirty),
+#endif
+#ifdef pt_sw_bit
+ KUNIT_CASE_FMT(test_sw_bit_leaf),
+ KUNIT_CASE_FMT(test_sw_bit_table),
+#endif
+ {},
+};
+
+static int pt_kunit_generic_pt_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv;
+ int ret;
+
+ priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ ret = pt_kunit_priv_init(test, priv);
+ if (ret) {
+ kunit_kfree(test, priv);
+ return ret;
+ }
+ test->priv = priv;
+ return 0;
+}
+
+static void pt_kunit_generic_pt_exit(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!test->priv)
+ return;
+
+ pt_iommu_deinit(priv->iommu);
+ kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(generic_pt_suite) = {
+ .name = __stringify(NS(fmt_test)),
+ .init = pt_kunit_generic_pt_init,
+ .exit = pt_kunit_generic_pt_exit,
+ .test_cases = generic_pt_test_cases,
+};
+kunit_test_suites(&NS(generic_pt_suite));
diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h
new file mode 100644
index 000000000000..22c9e4c4dd97
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_KUNIT_IOMMU_H
+#define __GENERIC_PT_KUNIT_IOMMU_H
+
+#define GENERIC_PT_KUNIT 1
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include "../iommu-pages.h"
+#include "pt_iter.h"
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+ const struct pt_iommu_table_cfg *cfg, gfp_t gfp);
+
+/* The format can provide a list of configurations it would like to test */
+#ifdef kunit_fmt_cfgs
+static const void *kunit_pt_gen_params_cfg(struct kunit *test, const void *prev,
+ char *desc)
+{
+ uintptr_t cfg_id = (uintptr_t)prev;
+
+ cfg_id++;
+ if (cfg_id >= ARRAY_SIZE(kunit_fmt_cfgs) + 1)
+ return NULL;
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s_cfg_%u",
+ __stringify(PTPFX_RAW), (unsigned int)(cfg_id - 1));
+ return (void *)cfg_id;
+}
+#define KUNIT_CASE_FMT(test_name) \
+ KUNIT_CASE_PARAM(test_name, kunit_pt_gen_params_cfg)
+#else
+#define KUNIT_CASE_FMT(test_name) KUNIT_CASE(test_name)
+#endif
+
+#define KUNIT_ASSERT_NO_ERRNO(test, ret) \
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, KUNIT_SUBSUBTEST_INDENT "errno %pe", \
+ ERR_PTR(ret))
+
+#define KUNIT_ASSERT_NO_ERRNO_FN(test, fn, ret) \
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, \
+ KUNIT_SUBSUBTEST_INDENT "errno %pe from %s", \
+ ERR_PTR(ret), fn)
+
+/*
+ * When the test is run on a 32 bit system unsigned long can be 32 bits. This
+ * causes the iommu op signatures to be restricted to 32 bits, meaning the test
+ * has to be mindful not to create any VAs over the 32 bit limit. Reduce the
+ * scope of the testing, as the main purpose of checking on full 32 bit is to
+ * look for 32bitisms in the core code. Run the test on i386 with X86_PAE=y to
+ * get the full coverage when dma_addr_t & phys_addr_t are 8 bytes.
+ */
+#define IS_32BIT (sizeof(unsigned long) == 4)
+
+struct kunit_iommu_priv {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu_table fmt_table;
+ };
+ spinlock_t top_lock;
+ struct device *dummy_dev;
+ struct pt_iommu *iommu;
+ struct pt_common *common;
+ struct pt_iommu_table_cfg cfg;
+ struct pt_iommu_info info;
+ unsigned int smallest_pgsz_lg2;
+ pt_vaddr_t smallest_pgsz;
+ unsigned int largest_pgsz_lg2;
+ pt_oaddr_t test_oa;
+ pt_vaddr_t safe_pgsize_bitmap;
+ unsigned long orig_nr_secondary_pagetable;
+};
+PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain);
+
+static void pt_kunit_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ iommu_put_pages_list(&gather->freelist);
+}
+
+#define IOMMU_PT_DOMAIN_OPS1(x) IOMMU_PT_DOMAIN_OPS(x)
+static const struct iommu_domain_ops kunit_pt_ops = {
+ IOMMU_PT_DOMAIN_OPS1(PTPFX_RAW),
+ .iotlb_sync = &pt_kunit_iotlb_sync,
+};
+
+static void pt_kunit_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level)
+{
+}
+
+static spinlock_t *pt_kunit_get_top_lock(struct pt_iommu *iommu_table)
+{
+ struct kunit_iommu_priv *priv = container_of(
+ iommu_table, struct kunit_iommu_priv, fmt_table.iommu);
+
+ return &priv->top_lock;
+}
+
+static const struct pt_iommu_driver_ops pt_kunit_driver_ops = {
+ .change_top = &pt_kunit_change_top,
+ .get_top_lock = &pt_kunit_get_top_lock,
+};
+
+static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv)
+{
+ unsigned int va_lg2sz;
+ int ret;
+
+ /* Enough so the memory allocator works */
+ priv->dummy_dev = kunit_device_register(test, "pt_kunit_dev");
+ if (IS_ERR(priv->dummy_dev))
+ return PTR_ERR(priv->dummy_dev);
+ set_dev_node(priv->dummy_dev, NUMA_NO_NODE);
+
+ spin_lock_init(&priv->top_lock);
+
+#ifdef kunit_fmt_cfgs
+ priv->cfg = kunit_fmt_cfgs[((uintptr_t)test->param_value) - 1];
+ /*
+ * The format can set a list of features that the kunit_fmt_cfgs
+	 * controls; other features default to on.
+ */
+ priv->cfg.common.features |= PT_SUPPORTED_FEATURES &
+ (~KUNIT_FMT_FEATURES);
+#else
+ priv->cfg.common.features = PT_SUPPORTED_FEATURES;
+#endif
+
+ /* Defaults, for the kunit */
+ if (!priv->cfg.common.hw_max_vasz_lg2)
+ priv->cfg.common.hw_max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+ if (!priv->cfg.common.hw_max_oasz_lg2)
+ priv->cfg.common.hw_max_oasz_lg2 = pt_max_oa_lg2(NULL);
+
+ priv->fmt_table.iommu.nid = NUMA_NO_NODE;
+ priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops;
+ priv->fmt_table.iommu.iommu_device = priv->dummy_dev;
+ priv->domain.ops = &kunit_pt_ops;
+ ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL);
+ if (ret) {
+ if (ret == -EOVERFLOW)
+ kunit_skip(
+ test,
+ "This configuration cannot be tested on 32 bit");
+ return ret;
+ }
+
+ priv->iommu = &priv->fmt_table.iommu;
+ priv->common = common_from_iommu(&priv->fmt_table.iommu);
+ priv->iommu->ops->get_info(priv->iommu, &priv->info);
+
+ /*
+	 * size_t is used to pass the mapping length and it can be 32 bit.
+	 * Truncate the page sizes so we don't use large sizes.
+ */
+ priv->info.pgsize_bitmap = (size_t)priv->info.pgsize_bitmap;
+
+ priv->smallest_pgsz_lg2 = vaffs(priv->info.pgsize_bitmap);
+ priv->smallest_pgsz = log2_to_int(priv->smallest_pgsz_lg2);
+ priv->largest_pgsz_lg2 =
+ vafls((dma_addr_t)priv->info.pgsize_bitmap) - 1;
+
+ priv->test_oa =
+ oalog2_mod(0x74a71445deadbeef, priv->common->max_oasz_lg2);
+
+ /*
+ * We run out of VA space if the mappings get too big, make something
+ * smaller that can safely pass through dma_addr_t API.
+ */
+ va_lg2sz = priv->common->max_vasz_lg2;
+ if (IS_32BIT && va_lg2sz > 32)
+ va_lg2sz = 32;
+ priv->safe_pgsize_bitmap =
+ log2_mod(priv->info.pgsize_bitmap, va_lg2sz - 1);
+
+ return 0;
+}
+
+#endif
diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h
new file mode 100644
index 000000000000..e8a63c8ea850
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len);
+
+struct count_valids {
+ u64 per_size[PT_VADDR_MAX_LG2];
+};
+
+static int __count_valids(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct count_valids *valids = arg;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ pt_descend(&pts, arg, __count_valids);
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA) {
+ valids->per_size[pt_entry_oa_lg2sz(&pts)]++;
+ continue;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Number of valid table entries. This counts contiguous entries as a single
+ * valid entry.
+ */
+static unsigned int count_valids(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ u64 total = 0;
+ unsigned int i;
+
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+
+ for (i = 0; i != ARRAY_SIZE(valids.per_size); i++)
+ total += valids.per_size[i];
+ return total;
+}
+
+/* Only a single page size is present, count the number of valid entries */
+static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ u64 total = 0;
+ unsigned int i;
+
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+
+ for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) {
+ if ((1ULL << i) == pgsz)
+ total = valids.per_size[i];
+ else
+ KUNIT_ASSERT_EQ(test, valids.per_size[i], 0);
+ }
+ return total;
+}
+
+static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ size_t ret;
+
+ ret = iommu_unmap(&priv->domain, va, len);
+ KUNIT_ASSERT_EQ(test, ret, len);
+}
+
+static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2);
+ pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2);
+
+ for (; pfn != end_pfn; pfn++) {
+ phys_addr_t res = iommu_iova_to_phys(&priv->domain,
+ pfn * priv->smallest_pgsz);
+
+ KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa);
+ if (res != pa)
+ break;
+ pa += priv->smallest_pgsz;
+ }
+}
+
+static void test_increase_level(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_common *common = priv->common;
+
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format");
+
+ if (IS_32BIT)
+ kunit_skip(test, "Unable to test on 32bit");
+
+ KUNIT_ASSERT_GT(test, common->max_vasz_lg2,
+ pt_top_range(common).max_vasz_lg2);
+
+ /* Add every possible level to the max */
+ while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) {
+ struct pt_range top_range = pt_top_range(common);
+
+ if (top_range.va == 0)
+ do_map(test, top_range.last_va + 1, 0,
+ priv->smallest_pgsz);
+ else
+ do_map(test, top_range.va - priv->smallest_pgsz, 0,
+ priv->smallest_pgsz);
+
+ KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level,
+ top_range.top_level + 1);
+ KUNIT_ASSERT_GE(test, common->max_vasz_lg2,
+ pt_top_range(common).max_vasz_lg2);
+ }
+}
+
+static void test_map_simple(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+ unsigned int pgsz_lg2;
+ pt_vaddr_t cur_va;
+
+ /* Map every reported page size */
+ cur_va = range.va + priv->smallest_pgsz * 256;
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+ u64 len = log2_to_int(pgsz_lg2);
+
+ if (!(pgsize_bitmap & len))
+ continue;
+
+ cur_va = ALIGN(cur_va, len);
+ do_map(test, cur_va, paddr, len);
+ if (len <= SZ_2G)
+ check_iova(test, cur_va, paddr, len);
+ cur_va += len;
+ }
+
+ /* The read interface reports that every page size was created */
+ range = pt_top_range(priv->common);
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ if (pgsize_bitmap & (1ULL << pgsz_lg2))
+ KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1);
+ else
+ KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0);
+ }
+
+ /* Unmap works */
+ range = pt_top_range(priv->common);
+ cur_va = range.va + priv->smallest_pgsz * 256;
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ u64 len = log2_to_int(pgsz_lg2);
+
+ if (!(pgsize_bitmap & len))
+ continue;
+ cur_va = ALIGN(cur_va, len);
+ do_unmap(test, cur_va, len);
+ cur_va += len;
+ }
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+}
+
+/*
+ * Test to convert a table pointer into an OA by mapping something small,
+ * unmapping it so as to leave behind a table pointer, then mapping something
+ * larger that will convert the table into an OA.
+ */
+static void test_map_table_to_oa(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t limited_pgbitmap =
+ priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G);
+ struct pt_range range = pt_top_range(priv->common);
+ unsigned int pgsz_lg2;
+ pt_vaddr_t max_pgsize;
+ pt_vaddr_t cur_va;
+
+ max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1);
+ KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize);
+
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+ u64 len = log2_to_int(pgsz_lg2);
+ pt_vaddr_t offset;
+
+ if (!(priv->info.pgsize_bitmap & len))
+ continue;
+ if (len > max_pgsize)
+ break;
+
+ cur_va = ALIGN(range.va + priv->smallest_pgsz * 256,
+ max_pgsize);
+ for (offset = 0; offset != max_pgsize; offset += len)
+ do_map(test, cur_va + offset, paddr + offset, len);
+ check_iova(test, cur_va, paddr, max_pgsize);
+ KUNIT_ASSERT_EQ(test, count_valids_single(test, len),
+ log2_div(max_pgsize, pgsz_lg2));
+
+ if (len == max_pgsize) {
+ do_unmap(test, cur_va, max_pgsize);
+ } else {
+ do_unmap(test, cur_va, max_pgsize / 2);
+ for (offset = max_pgsize / 2; offset != max_pgsize;
+ offset += len)
+ do_unmap(test, cur_va + offset, len);
+ }
+
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+ }
+}
+
+/*
+ * Test unmapping a small page at the start of a large page. This always unmaps
+ * the large page.
+ */
+static void test_unmap_split(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+ unsigned int pgsz_lg2;
+ unsigned int count = 0;
+
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_vaddr_t base_len = log2_to_int(pgsz_lg2);
+ unsigned int next_pgsz_lg2;
+
+ if (!(pgsize_bitmap & base_len))
+ continue;
+
+ for (next_pgsz_lg2 = pgsz_lg2 + 1;
+ next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) {
+ pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2);
+ pt_vaddr_t vaddr = top_range.va;
+ pt_oaddr_t paddr = 0;
+ size_t gnmapped;
+
+ if (!(pgsize_bitmap & next_len))
+ continue;
+
+ do_map(test, vaddr, paddr, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+ /* Make sure unmap doesn't keep going */
+ do_map(test, vaddr, paddr, next_len);
+ do_map(test, vaddr + next_len, paddr, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr + next_len,
+ next_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+ count++;
+ }
+ }
+
+ if (count == 0)
+ kunit_skip(test, "Test needs two page sizes");
+}
+
+static void unmap_collisions(struct kunit *test, struct maple_tree *mt,
+ pt_vaddr_t start, pt_vaddr_t last)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ MA_STATE(mas, mt, start, last);
+ void *entry;
+
+ mtree_lock(mt);
+ mas_for_each(&mas, entry, last) {
+ pt_vaddr_t mas_start = mas.index;
+ pt_vaddr_t len = (mas.last - mas_start) + 1;
+ pt_oaddr_t paddr;
+
+ mas_erase(&mas);
+ mas_pause(&mas);
+ mtree_unlock(mt);
+
+ paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2);
+ check_iova(test, mas_start, paddr, len);
+ do_unmap(test, mas_start, len);
+ mtree_lock(mt);
+ }
+ mtree_unlock(mt);
+}
+
+static void clamp_range(struct kunit *test, struct pt_range *range)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (range->last_va - range->va > SZ_1G)
+ range->last_va = range->va + SZ_1G;
+ KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX);
+ if (range->va <= MAPLE_RESERVED_RANGE)
+ range->va =
+ ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz);
+}
+
+/*
+ * Randomly map and unmap ranges that can use large physical pages. If a random
+ * range overlaps with existing ranges then unmap them. This hits all the
+ * special cases.
+ */
+static void test_random_map(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range upper_range = pt_upper_range(priv->common);
+ struct pt_range top_range = pt_top_range(priv->common);
+ struct maple_tree mt;
+ unsigned int iter;
+
+ mt_init(&mt);
+
+ /*
+ * Shrink the range so randomization is more likely to have
+ * intersections
+ */
+ clamp_range(test, &top_range);
+ clamp_range(test, &upper_range);
+
+ for (iter = 0; iter != 1000; iter++) {
+ struct pt_range *range = &top_range;
+ pt_oaddr_t paddr;
+ pt_vaddr_t start;
+ pt_vaddr_t end;
+ int ret;
+
+ if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) &&
+ ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1))
+ range = &upper_range;
+
+ start = get_random_u32_below(
+ min(U32_MAX, range->last_va - range->va));
+ end = get_random_u32_below(
+ min(U32_MAX, range->last_va - start));
+
+ start = ALIGN_DOWN(start, priv->smallest_pgsz);
+ end = ALIGN(end, priv->smallest_pgsz);
+ start += range->va;
+ end += start;
+ if (start < range->va || end > range->last_va + 1 ||
+ start >= end)
+ continue;
+
+ /* Try overmapping to test the failure handling */
+ paddr = oalog2_mod(start, priv->common->max_oasz_lg2);
+ ret = iommu_map(&priv->domain, start, paddr, end - start,
+ IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
+ if (ret) {
+ KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE);
+ unmap_collisions(test, &mt, start, end - 1);
+ do_map(test, start, paddr, end - start);
+ }
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range",
+ mtree_insert_range(&mt, start, end - 1,
+ XA_ZERO_ENTRY,
+ GFP_KERNEL));
+
+ check_iova(test, start, paddr, end - start);
+ if (iter % 100)
+ cond_resched();
+ }
+
+ unmap_collisions(test, &mt, 0, PT_VADDR_MAX);
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+
+ mtree_destroy(&mt);
+}
+
+/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */
+static void test_pgsize_boundary(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+
+ if (top_range.va != 0 || top_range.last_va < 0xfef9ffff ||
+ priv->smallest_pgsz != SZ_4K)
+ kunit_skip(test, "Format does not have the required range");
+
+ do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1);
+}
+
+/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */
+static void test_mixed(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ u64 start = 0x3fe400ULL << 12;
+ u64 end = 0x4c0600ULL << 12;
+ pt_vaddr_t len = end - start;
+ pt_oaddr_t oa = start;
+
+ if (top_range.last_va <= start || sizeof(unsigned long) == 4)
+ kunit_skip(test, "range is too small");
+ if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21)))
+ kunit_skip(test, "incompatible psize");
+
+ do_map(test, start, oa, len);
+ /* 14 2M, 3 1G, 3 2M */
+ KUNIT_ASSERT_EQ(test, count_valids(test), 20);
+ check_iova(test, start, oa, len);
+}
+
+static struct kunit_case iommu_test_cases[] = {
+ KUNIT_CASE_FMT(test_increase_level),
+ KUNIT_CASE_FMT(test_map_simple),
+ KUNIT_CASE_FMT(test_map_table_to_oa),
+ KUNIT_CASE_FMT(test_unmap_split),
+ KUNIT_CASE_FMT(test_random_map),
+ KUNIT_CASE_FMT(test_pgsize_boundary),
+ KUNIT_CASE_FMT(test_mixed),
+ {},
+};
+
+static int pt_kunit_iommu_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv;
+ int ret;
+
+ priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->orig_nr_secondary_pagetable =
+ global_node_page_state(NR_SECONDARY_PAGETABLE);
+ ret = pt_kunit_priv_init(test, priv);
+ if (ret) {
+ kunit_kfree(test, priv);
+ return ret;
+ }
+ test->priv = priv;
+ return 0;
+}
+
+static void pt_kunit_iommu_exit(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!test->priv)
+ return;
+
+ pt_iommu_deinit(priv->iommu);
+ /*
+ * Look for memory leaks, assumes kunit is running isolated and nothing
+ * else is using secondary page tables.
+ */
+ KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable,
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
+ kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(iommu_suite) = {
+ .name = __stringify(NS(iommu_test)),
+ .init = pt_kunit_iommu_init,
+ .exit = pt_kunit_iommu_exit,
+ .test_cases = iommu_test_cases,
+};
+kunit_test_suites(&NS(iommu_suite));
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kunit for generic page table");
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h
new file mode 100644
index 000000000000..e1123d35c907
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_common.h
@@ -0,0 +1,389 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included after the format. It contains definitions
+ * that build on the format definitions to create the basic format API.
+ *
+ * The format API is listed here, with kdocs. The functions without bodies are
+ * implemented in the format using the pattern:
+ * static inline FMTpt_XXX(..) {..}
+ * #define pt_XXX FMTpt_XXX
+ *
+ * If the format doesn't implement a function then pt_fmt_defaults.h can provide
+ * a generic version.
+ *
+ * The routines marked "@pts: Entry to query" operate on the entire contiguous
+ * entry and can be called with a pts->index pointing to any sub item that makes
+ * up that entry.
+ *
+ * The header order is:
+ * pt_defs.h
+ * FMT.h
+ * pt_common.h
+ */
+#ifndef __GENERIC_PT_PT_COMMON_H
+#define __GENERIC_PT_PT_COMMON_H
+
+#include "pt_defs.h"
+#include "pt_fmt_defaults.h"
+
+/**
+ * pt_attr_from_entry() - Convert the permission bits back to attrs
+ * @pts: Entry to convert from
+ * @attrs: Resulting attrs
+ *
+ * Fill in the attrs with the permission bits encoded in the current leaf entry.
+ * The attrs should be usable with pt_install_leaf_entry() to reconstruct the
+ * same entry.
+ */
+static inline void pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs);
+
+/**
+ * pt_can_have_leaf() - True if the current level can have an OA entry
+ * @pts: The current level
+ *
+ * True if the current level can support pt_install_leaf_entry(). A leaf
+ * entry produces an OA.
+ */
+static inline bool pt_can_have_leaf(const struct pt_state *pts);
+
+/**
+ * pt_can_have_table() - True if the current level can have a lower table
+ * @pts: The current level
+ *
+ * Every level except 0 is allowed to have a lower table.
+ */
+static inline bool pt_can_have_table(const struct pt_state *pts)
+{
+ /* No further tables at level 0 */
+ return pts->level > 0;
+}
+
+/**
+ * pt_clear_entries() - Make entries empty (non-present)
+ * @pts: Starting table index
+ * @num_contig_lg2: Number of contiguous items to clear
+ *
+ * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY
+ * and does not have any effect on table walking. The starting index must be
+ * aligned to num_contig_lg2.
+ */
+static inline void pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2);
+
+/**
+ * pt_entry_make_write_dirty() - Make an entry dirty
+ * @pts: Table entry to change
+ *
+ * Make pt_entry_is_write_dirty() return true for this entry. This can be called
+ * asynchronously with any other table manipulation under an RCU lock and must
+ * not corrupt the table.
+ */
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts);
+
+/**
+ * pt_entry_make_write_clean() - Make the entry write clean
+ * @pts: Table entry to change
+ *
+ * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will
+ * eventually be notified of this change via a TLB flush, which is the point
+ * that the HW must become synchronized. Any "write dirty" prior to the TLB
+ * flush can be lost, but once the TLB flush completes all writes must make
+ * their entries write dirty.
+ *
+ * The format should alter the entry in a way that is compatible with any
+ * concurrent update from HW. The entire contiguous entry is changed.
+ */
+static inline void pt_entry_make_write_clean(struct pt_state *pts);
+
+/**
+ * pt_entry_is_write_dirty() - True if the entry has been written to
+ * @pts: Entry to query
+ *
+ * "write dirty" means that the HW has written to the OA translated
+ * by this entry. If the entry is contiguous then the consolidated
+ * "write dirty" for all the items must be returned.
+ */
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts);
+
+/**
+ * pt_dirty_supported() - True if the page table supports dirty tracking
+ * @common: Page table to query
+ */
+static inline bool pt_dirty_supported(struct pt_common *common);
+
+/**
+ * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the number of contiguous items this leaf entry spans. If the entry
+ * is a single item it returns ilog2(1).
+ */
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa() - Output Address for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the output address for the start of the entry. If the entry
+ * is contiguous this returns the same value for each sub-item. I.e.::
+ *
+ * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0
+ *
+ * See pt_item_oa(). The format should implement one of these two functions
+ * depending on how it stores the OAs in the table.
+ */
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa_lg2sz() - Return the size of an OA entry
+ * @pts: Entry to query
+ *
+ * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
+ * it returns the total VA/OA size of the entire contiguous entry.
+ */
+static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
+{
+ return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
+}
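+
+/*
+ * Worked example with illustrative numbers (not taken from any particular
+ * format): 16 contiguous 4K items give pt_entry_num_contig_lg2() == 4 and
+ * pt_table_item_lg2sz() == 12, so pt_entry_oa_lg2sz() == 16 and the entry
+ * spans 64K of VA/OA.
+ */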
+
+/**
+ * pt_entry_oa_exact() - Return the complete OA for an entry
+ * @pts: Entry to query
+ *
+ * During iteration the first entry could have a VA with an offset from the
+ * natural start of the entry. Return the exact OA including the pts's VA
+ * offset.
+ */
+static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts)
+{
+ return _pt_entry_oa_fast(pts) |
+ log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts));
+}
+
+/**
+ * pt_full_va_prefix() - The top bits of the VA
+ * @common: Page table to query
+ *
+ * This is usually 0, but some formats have their VA space going downward from
+ * PT_VADDR_MAX, and will return that instead. This value must always be
+ * adjusted by struct pt_common max_vasz_lg2.
+ */
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common);
+
+/**
+ * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry
+ * @common: Page table to query
+ *
+ * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT).
+ * This is useful to create optimized paths for common cases of PAGE_SIZE
+ * mappings.
+ */
+static inline bool pt_has_system_page_size(const struct pt_common *common);
+
+/**
+ * pt_install_leaf_entry() - Write a leaf entry to the table
+ * @pts: Table index to change
+ * @oa: Output Address for this leaf
+ * @oasz_lg2: Size in VA/OA for this leaf
+ * @attrs: Attributes to modify the entry
+ *
+ * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates
+ * the VA indicated by pts to the given OA.
+ *
+ * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz().
+ * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2.
+ *
+ * This must not be called if pt_can_have_leaf() == false. Contiguous sizes
+ * not indicated by pt_possible_sizes() must not be specified.
+ */
+static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs);
+
+/**
+ * pt_install_table() - Write a table entry to the table
+ * @pts: Table index to change
+ * @table_pa: CPU physical address of the lower table's memory
+ * @attrs: Attributes to modify the table index
+ *
+ * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa
+ * is the table at pts->level - 1. This is done by cmpxchg so pts must have the
+ * current entry loaded. The pts is updated with the installed entry.
+ *
+ * This must not be called if pt_can_have_table() == false.
+ *
+ * Returns: true if the table was installed successfully.
+ */
+static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs);
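+
+/*
+ * A usage sketch, not lifted from a real caller: a mapper that finds an empty
+ * item allocates a lower table and races to install it, freeing the new
+ * memory and reparsing if another thread won the cmpxchg:
+ *
+ *   pt_load_entry(pts);
+ *   if (pts->type == PT_ENTRY_EMPTY &&
+ *       !pt_install_table(pts, table_pa, &attrs))
+ *           goto free_table_and_reparse;
+ */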
+
+/**
+ * pt_item_oa() - Output Address for this leaf item
+ * @pts: Item to query
+ *
+ * Return the output address for this item. If the item is part of a contiguous
+ * entry it returns the value of the OA for this individual sub item.
+ *
+ * See pt_entry_oa(). The format should implement one of these two functions
+ * depending on how it stores the OAs in the table.
+ */
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts);
+
+/**
+ * pt_load_entry_raw() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Return the type of entry that was loaded. pts->entry will be filled in with
+ * the entry's content. See pt_load_entry().
+ */
+static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts);
+
+/**
+ * pt_max_oa_lg2() - Return the maximum OA the table format can hold
+ * @common: Page table to query
+ *
+ * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the
+ * OA. This is the absolute maximum address the table can hold. struct pt_common
+ * max_oasz_lg2 sets a lower dynamic maximum based on HW capability.
+ */
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common);
+
+/**
+ * pt_num_items_lg2() - Return the number of items in this table level
+ * @pts: The current level
+ *
+ * The number of items in a table level defines the number of bits this level
+ * decodes from the VA. This function is not called for the top level,
+ * so it does not need to compute a special value for the top case. The
+ * result for the top is based on pt_common max_vasz_lg2.
+ *
+ * The value is used as part of determining the table indexes via the
+ * equation::
+ *
+ * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2())
+ */
+static inline unsigned int pt_num_items_lg2(const struct pt_state *pts);
+
+/**
+ * pt_pgsz_lg2_to_level - Return the level that maps the page size
+ * @common: Page table to query
+ * @pgsize_lg2: Log2 page size
+ *
+ * Returns the table level that will map the given page size. The page
+ * size must be part of the pt_possible_sizes() for some level.
+ */
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+ unsigned int pgsize_lg2);
+
+/**
+ * pt_possible_sizes() - Return a bitmap of possible output sizes at this level
+ * @pts: The current level
+ *
+ * Each level has a list of possible output sizes that can be installed as
+ * leaf entries. If pt_can_have_leaf() is false returns zero.
+ *
+ * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating
+ * that a non-contiguous single item leaf entry is supported. The following
+ * pt_num_items_lg2() number of bits can be set indicating contiguous entries
+ * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be
+ * set; contiguous entries cannot span the entire table.
+ *
+ * The OR of pt_possible_sizes() of all levels is the typical bitmask of all
+ * supported sizes in the entire table.
+ */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts);
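+
+/*
+ * Illustrative example only: a level with 4K items that supports 16-item
+ * contiguous entries would return BIT(12) | BIT(16), i.e. 4K and 64K leaf
+ * sizes. The real bitmap is entirely up to the format.
+ */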
+
+/**
+ * pt_table_item_lg2sz() - Size of a single item entry in this table level
+ * @pts: The current level
+ *
+ * The size of the item specifies how much VA and OA a single item occupies.
+ *
+ * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous
+ * entries.
+ */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+/**
+ * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table
+ * @pts: The current level
+ *
+ * Return the size of VA decoded by the entire table level.
+ */
+static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts)
+{
+ if (pts->range->top_level == pts->level)
+ return pts->range->max_vasz_lg2;
+ return min_t(unsigned int, pts->range->common->max_vasz_lg2,
+ pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts));
+}
+
+/**
+ * pt_table_pa() - Return the CPU physical address of the table entry
+ * @pts: Entry to query
+ *
+ * This is only ever called on PT_ENTRY_TABLE entries. Must return the same
+ * value passed to pt_install_table().
+ */
+static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts);
+
+/**
+ * pt_table_ptr() - Return a CPU pointer for a table item
+ * @pts: Entry to query
+ *
+ * Same as pt_table_pa() but returns a CPU pointer.
+ */
+static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts)
+{
+ return __va(pt_table_pa(pts));
+}
+
+/**
+ * pt_max_sw_bit() - Return the maximum software bit usable for any level and
+ * entry
+ * @common: Page table
+ *
+ * Any bit number up to and including the returned value can be passed as
+ * bitnr to the other sw_bit functions.
+ */
+static inline unsigned int pt_max_sw_bit(struct pt_common *common);
+
+/**
+ * pt_test_sw_bit_acquire() - Read a software bit in an item
+ * @pts: Entry to read
+ * @bitnr: Bit to read
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a test bit and acquire operation.
+ */
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr);
+
+/**
+ * pt_set_sw_bit_release() - Set a software bit in an item
+ * @pts: Entry to set
+ * @bitnr: Bit to set
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a set bit and release operation.
+ */
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr);
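+
+/*
+ * Pairing sketch: a publisher makes some state visible and then sets the bit
+ * with release semantics; an observer that sees the bit via the acquire test
+ * is guaranteed to also see the published state:
+ *
+ *   pt_set_sw_bit_release(pts, bitnr);        (publisher)
+ *
+ *   if (pt_test_sw_bit_acquire(pts, bitnr))   (observer)
+ *           ... published state is visible here ...
+ */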
+
+/**
+ * pt_load_entry() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Set the type of entry that was loaded. pts->entry and pts->table_lower
+ * will be filled in with the entry's content.
+ */
+static inline void pt_load_entry(struct pt_state *pts)
+{
+ pts->type = pt_load_entry_raw(pts);
+ if (pts->type == PT_ENTRY_TABLE)
+ pts->table_lower = pt_table_ptr(pts);
+}
+#endif
diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h
new file mode 100644
index 000000000000..c25544d72f97
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_defs.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included before the format. It contains definitions
+ * that are required to compile the format. The header order is:
+ * pt_defs.h
+ * fmt_XX.h
+ * pt_common.h
+ */
+#ifndef __GENERIC_PT_DEFS_H
+#define __GENERIC_PT_DEFS_H
+
+#include <linux/generic_pt/common.h>
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/bits.h>
+#include <linux/limits.h>
+#include <linux/bug.h>
+#include <linux/kconfig.h>
+#include "pt_log2.h"
+
+/* Header self-compile default defines */
+#ifndef pt_write_attrs
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+#endif
+
+struct pt_table_p;
+
+enum {
+ PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX,
+ PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32,
+ PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX,
+ PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32,
+};
+
+/*
+ * The format instantiation can have features wired off or on to optimize the
+ * code gen. Supported features are just a reflection of what the current set of
+ * kernel users want to use.
+ */
+#ifndef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES 0
+#endif
+
+/*
+ * When in debug mode we compile all formats with all features. This allows the
+ * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or
+ * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have.
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+enum {
+ PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
+ PT_DEBUG_SUPPORTED_FEATURES =
+ UINT_MAX &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ?
+ 0 :
+ BIT(PT_FEAT_DMA_INCOHERENT))) &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
+ BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
+ BIT(PT_FEAT_SIGN_EXTEND)),
+};
+#undef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES
+#endif
+
+#ifndef PT_FORCE_ENABLED_FEATURES
+#define PT_FORCE_ENABLED_FEATURES 0
+#endif
+
+/**
+ * DOC: Generic Page Table Language
+ *
+ * Language used in Generic Page Table
+ * VA
+ * The input address to the page table, often the virtual address.
+ * OA
+ * The output address from the page table, often the physical address.
+ * leaf
+ * An entry that results in an output address.
+ * start/end
+ *   A half-open range, e.g. [0,0) refers to no VA.
+ * start/last
+ * An inclusive closed range, e.g. [0,0] refers to the VA 0
+ * common
+ * The generic page table container struct pt_common
+ * level
+ *   Level 0 is always a table of only leaves with no further table pointers.
+ * Increasing levels increase the size of the table items. The least
+ * significant VA bits used to index page tables are used to index the Level
+ * 0 table. The various labels for table levels used by HW descriptions are
+ * not used.
+ * top_level
+ * The inclusive highest level of the table. A two-level table
+ * has a top level of 1.
+ * table
+ * A linear array of translation items for that level.
+ * index
+ * The position in a table of an element: item = table[index]
+ * item
+ * A single index in a table
+ * entry
+ * A single logical element in a table. If contiguous pages are not
+ * supported then item and entry are the same thing, otherwise entry refers
+ * to all the items that comprise a single contiguous translation.
+ * item/entry_size
+ * The number of bytes of VA the table index translates for.
+ * If the item is a table entry then the next table covers
+ * this size. If the entry translates to an output address then the
+ * full OA is: OA | (VA % entry_size)
+ * contig_count
+ * The number of consecutive items fused into a single entry.
+ * item_size * contig_count is the size of that entry's translation.
+ * lg2
+ * Indicates the value is encoded as log2, i.e. 1<<x is the actual value.
+ *   Normally the compiler can optimize divide and mod with log2 values
+ *   automatically when inlining; however, if the values are not constant
+ *   expressions it can't. So we do it by hand, since we want to avoid 64-bit
+ * divmod.
+ */
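+
+/*
+ * A small sketch of the lg2 convention (values are illustrative): a 4K item
+ * size is stored as 12 and the log2 helpers below expand it without a 64-bit
+ * divmod:
+ *
+ *   pt_vaddr_t va = 0x12345;
+ *   pt_vaddr_t index = log2_div(va, 12);    becomes va >> 12 == 0x12
+ *   pt_vaddr_t offset = log2_mod(va, 12);   becomes va & 0xfff == 0x345
+ */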
+
+/* Returned by pt_load_entry() and for_each_pt_level_entry() */
+enum pt_entry_type {
+ PT_ENTRY_EMPTY,
+ /* Entry is valid and points to a lower table level */
+ PT_ENTRY_TABLE,
+ /* Entry is valid and returns an output address */
+ PT_ENTRY_OA,
+};
+
+struct pt_range {
+ struct pt_common *common;
+ struct pt_table_p *top_table;
+ pt_vaddr_t va;
+ pt_vaddr_t last_va;
+ u8 top_level;
+ u8 max_vasz_lg2;
+};
+
+/*
+ * Similar to xa_state, this records information about an in-progress parse at a
+ * single level.
+ */
+struct pt_state {
+ struct pt_range *range;
+ struct pt_table_p *table;
+ struct pt_table_p *table_lower;
+ u64 entry;
+ enum pt_entry_type type;
+ unsigned short index;
+ unsigned short end_index;
+ u8 level;
+};
+
+#define pt_cur_table(pts, type) ((type *)((pts)->table))
+
+/*
+ * Try to install a new table pointer. The locking methodology requires this to
+ * be atomic (multiple threads can race to install a pointer). The losing
+ * threads will fail the atomic and return false. They should free any memory
+ * and reparse the table level again.
+ */
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry)
+{
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ bool ret;
+
+ /*
+	 * Ensure the zero'd table content itself is visible before its PTE is.
+	 * release is a NOP on !SMP, but the HW is still doing an acquire.
+ */
+ if (!IS_ENABLED(CONFIG_SMP))
+ dma_wmb();
+ ret = try_cmpxchg64_release(entryp, &old_entry, table_entry);
+ if (ret)
+ pts->entry = table_entry;
+ return ret;
+}
+#endif
+
+static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry)
+{
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ bool ret;
+
+ /*
+	 * Ensure the zero'd table content itself is visible before its PTE is.
+	 * release is a NOP on !SMP, but the HW is still doing an acquire.
+ */
+ if (!IS_ENABLED(CONFIG_SMP))
+ dma_wmb();
+ ret = try_cmpxchg_release(entryp, &old_entry, table_entry);
+ if (ret)
+ pts->entry = table_entry;
+ return ret;
+}
+
+#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr))
+
+static inline bool pt_feature(const struct pt_common *common,
+ unsigned int feature_nr)
+{
+ if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr))
+ return true;
+ if (!PT_SUPPORTED_FEATURE(feature_nr))
+ return false;
+ return common->features & BIT(feature_nr);
+}
+
+static inline bool pts_feature(const struct pt_state *pts,
+ unsigned int feature_nr)
+{
+ return pt_feature(pts->range->common, feature_nr);
+}
+
+/*
+ * PT_WARN_ON is used for invariants that the kunit should be checking can't
+ * happen.
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+#define PT_WARN_ON WARN_ON
+#else
+static inline bool PT_WARN_ON(bool condition)
+{
+ return false;
+}
+#endif
+
+/* These all work on the VA type */
+#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2)
+#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2)
+#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2)
+#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2)
+#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2)
+#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2)
+#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2)
+#define vaffs(a) ffs_t(pt_vaddr_t, a)
+#define vafls(a) fls_t(pt_vaddr_t, a)
+#define vaffz(a) ffz_t(pt_vaddr_t, a)
+
+/*
+ * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and
+ * generate a useful defined result. The non-fva versions will malfunction at
+ * this extreme.
+ */
+static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return 0;
+ return log2_div_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return a;
+ return log2_mod_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b,
+ unsigned int c_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2)
+ return true;
+ return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val,
+ unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return val;
+ return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return PT_VADDR_MAX;
+ return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2);
+}
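+
+/*
+ * For instance (a sketch): on a 64-bit pt_vaddr_t, log2_div(a, 64) would be
+ * an undefined shift, while fvalog2_div(a, PT_VADDR_MAX_LG2) is defined to
+ * return 0 when PT_FEAT_FULL_VA is supported, behaving as a division by 2^64.
+ */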
+
+/* These all work on the OA type */
+#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2)
+#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2)
+#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2)
+#define oaffs(a) ffs_t(pt_oaddr_t, a)
+#define oafls(a) fls_t(pt_oaddr_t, a)
+#define oaffz(a) ffz_t(pt_oaddr_t, a)
+
+static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem,
+ unsigned int top_level)
+{
+ return top_level | (uintptr_t)table_mem;
+}
+
+static inline void pt_top_set(struct pt_common *common,
+ struct pt_table_p *table_mem,
+ unsigned int top_level)
+{
+ WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level));
+}
+
+static inline void pt_top_set_level(struct pt_common *common,
+ unsigned int top_level)
+{
+ pt_top_set(common, NULL, top_level);
+}
+
+static inline unsigned int pt_top_get_level(const struct pt_common *common)
+{
+ return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS);
+}
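+
+/*
+ * Layout sketch: top_of_table packs the top level into the low
+ * PT_TOP_LEVEL_BITS of the top table pointer, which works because the table
+ * allocation is always at least that aligned. E.g. a level-2 top table at
+ * 0x...1000 would be stored as 0x...1002.
+ */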
+
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+ pt_oaddr_t oa,
+ unsigned int oasz_lg2);
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h
new file mode 100644
index 000000000000..69fb7c2314ca
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Default definitions for formats that don't define these functions.
+ */
+#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
+#define __GENERIC_PT_PT_FMT_DEFAULTS_H
+
+#include "pt_defs.h"
+#include <linux/log2.h>
+
+/* Header self-compile default defines */
+#ifndef pt_load_entry_raw
+#include "fmt/amdv1.h"
+#endif
+
+/*
+ * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and
+ * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top.
+ */
+#ifndef pt_table_item_lg2sz
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts)
+{
+ return PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level;
+}
+#endif
+
+#ifndef pt_pgsz_lg2_to_level
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+ unsigned int pgsize_lg2)
+{
+ return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) /
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
+}
+#endif
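+
+/*
+ * With x86-like example numbers (illustrative only): PT_GRANULE_LG2SZ == 12,
+ * PT_TABLEMEM_LG2SZ == 12 and PT_ITEM_WORD_SIZE == 8 give 9 VA bits per
+ * level, so a 2M leaf (pgsize_lg2 == 21) resolves to level (21 - 12) / 9 == 1.
+ */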
+
+/*
+ * If not supplied by the format then contiguous pages are not supported.
+ *
+ * If contiguous pages are supported then the format must also provide
+ * pt_contig_count_lg2() if it supports a single contiguous size per level,
+ * or pt_possible_sizes() if it supports multiple sizes per level.
+ */
+#ifndef pt_entry_num_contig_lg2
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ return ilog2(1);
+}
+
+/*
+ * Return the number of contiguous OA items forming an entry at this table level
+ */
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts)
+{
+ return ilog2(1);
+}
+#endif
+
+/* If not supplied by the format then dirty tracking is not supported */
+#ifndef pt_entry_is_write_dirty
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ return false;
+}
+
+static inline void pt_entry_make_write_clean(struct pt_state *pts)
+{
+}
+
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+ return false;
+}
+#else
+/* If not supplied then dirty tracking is always enabled */
+#ifndef pt_dirty_supported
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+ return true;
+}
+#endif
+#endif
+
+#ifndef pt_entry_make_write_dirty
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ return false;
+}
+#endif
+
+/*
+ * Format supplies either:
+ * pt_entry_oa - OA is at the start of a contiguous entry
+ * or
+ * pt_item_oa - OA is adjusted for every item in a contiguous entry
+ *
+ * Build the missing one
+ *
+ * The internal helper _pt_entry_oa_fast() allows generating
+ * an efficient pt_entry_oa_exact(); it doesn't care which
+ * option is selected.
+ */
+#ifdef pt_entry_oa
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts)
+{
+ return pt_entry_oa(pts) |
+ log2_mul(pts->index, pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_entry_oa
+#endif
+
+#ifdef pt_item_oa
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts)
+{
+ return log2_set_mod(pt_item_oa(pts), 0,
+ pt_entry_num_contig_lg2(pts) +
+ pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_item_oa
+#endif
+
+/*
+ * If not supplied by the format then use the constant
+ * PT_MAX_OUTPUT_ADDRESS_LG2.
+ */
+#ifndef pt_max_oa_lg2
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common)
+{
+ return PT_MAX_OUTPUT_ADDRESS_LG2;
+}
+#endif
+
+#ifndef pt_has_system_page_size
+static inline bool pt_has_system_page_size(const struct pt_common *common)
+{
+ return PT_GRANULE_LG2SZ == PAGE_SHIFT;
+}
+#endif
+
+/*
+ * If not supplied by the format then assume only one contiguous size determined
+ * by pt_contig_count_lg2()
+ */
+#ifndef pt_possible_sizes
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts);
+
+/* Return a bitmap of possible leaf page sizes at this level */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!pt_can_have_leaf(pts))
+ return 0;
+ return log2_to_int(isz_lg2) |
+ log2_to_int(pt_contig_count_lg2(pts) + isz_lg2);
+}
+#endif
+
+/* If not supplied by the format then use 0. */
+#ifndef pt_full_va_prefix
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common)
+{
+ return 0;
+}
+#endif
+
+/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */
+#ifndef pt_clear_entries
+static inline void pt_clear_entries64(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries32(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u32 *tablep = pt_cur_table(pts, u32) + pts->index;
+ u32 *end = tablep + log2_to_int(num_contig_lg2);
+
+ PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ if (PT_ITEM_WORD_SIZE == sizeof(u32))
+ pt_clear_entries32(pts, num_contig_lg2);
+ else
+ pt_clear_entries64(pts, num_contig_lg2);
+}
+#define pt_clear_entries pt_clear_entries
+#endif
+
+/* If not supplied then SW bits are not supported */
+#ifdef pt_sw_bit
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ /* Acquire, pairs with pt_set_sw_bit_release() */
+ smp_mb();
+ /* For a contiguous entry the sw bit is only stored in the first item. */
+ return pts->entry & pt_sw_bit(bitnr);
+}
+#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+ if (PT_ITEM_WORD_SIZE == sizeof(u64)) {
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ u64 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ return;
+ }
+#endif
+ if (PT_ITEM_WORD_SIZE == sizeof(u32)) {
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ u32 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ } else
+ BUILD_BUG();
+}
+#define pt_set_sw_bit_release pt_set_sw_bit_release
+#else
+static inline unsigned int pt_max_sw_bit(struct pt_common *common)
+{
+ return 0;
+}
+
+extern void __pt_no_sw_bit(void);
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+ return false;
+}
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+}
+#endif
+
+/*
+ * The format can call this in its pt_install_leaf_entry() to check that the
+ * arguments are all aligned correctly.
+ */
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+ pt_oaddr_t oa,
+ unsigned int oasz_lg2)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2)))
+ return false;
+
+#ifdef pt_possible_sizes
+ if (PT_WARN_ON(isz_lg2 > oasz_lg2 ||
+ oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts)))
+ return false;
+#else
+ if (PT_WARN_ON(oasz_lg2 != isz_lg2 &&
+ oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts)))
+ return false;
+#endif
+
+ if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2)))
+ return false;
+ return true;
+}
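+
+/*
+ * A format would typically use it as a debug assertion at the top of its
+ * pt_install_leaf_entry() (sketch):
+ *
+ *   if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ *           return;
+ */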
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h
new file mode 100644
index 000000000000..c0d8617cce29
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_iter.h
@@ -0,0 +1,636 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Iterators for Generic Page Table
+ */
+#ifndef __GENERIC_PT_PT_ITER_H
+#define __GENERIC_PT_PT_ITER_H
+
+#include "pt_common.h"
+
+#include <linux/errno.h>
+
+/*
+ * Used to mangle symbols so that backtraces and the symbol table are
+ * understandable. Any non-inlined function should get mangled like this.
+ */
+#define NS(fn) CONCATENATE(PTPFX, fn)
+
+/**
+ * pt_check_range() - Validate the range can be iterated
+ * @range: Range to validate
+ *
+ * Check that VA and last_va fall within the permitted range of VAs. If the
+ * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension
+ * is correct.
+ */
+static inline int pt_check_range(struct pt_range *range)
+{
+ pt_vaddr_t prefix;
+
+ PT_WARN_ON(!range->max_vasz_lg2);
+
+ if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) {
+ PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2);
+ prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ?
+ PT_VADDR_MAX :
+ 0;
+ } else {
+ prefix = pt_full_va_prefix(range->common);
+ }
+
+ if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) ||
+ !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2))
+ return -ERANGE;
+ return 0;
+}
+
+/**
+ * pt_index_to_va() - Update range->va to the current pts->index
+ * @pts: Iteration State
+ *
+ * Adjust range->va to match the current index. This is done in a lazy manner
+ * since computing the VA takes several instructions and is rarely required.
+ */
+static inline void pt_index_to_va(struct pt_state *pts)
+{
+ pt_vaddr_t lower_va;
+
+ lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts));
+ pts->range->va = fvalog2_set_mod(pts->range->va, lower_va,
+ pt_table_oa_lg2sz(pts));
+}
+
+/*
+ * Add 1 << index_count_lg2 entries to pts's index. The index will be adjusted
+ * to the end of the contiguous block if it is currently in the middle.
+ */
+static inline void _pt_advance(struct pt_state *pts,
+ unsigned int index_count_lg2)
+{
+ pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
+ index_count_lg2);
+}
+
+/**
+ * pt_entry_fully_covered() - Check if the item or entry is entirely contained
+ * within pts->range
+ * @pts: Iteration State
+ * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or
+ * pt_entry_oa_lg2sz()
+ *
+ * Returns: true if the item is fully enclosed by the pts->range.
+ */
+static inline bool pt_entry_fully_covered(const struct pt_state *pts,
+ unsigned int oasz_lg2)
+{
+ struct pt_range *range = pts->range;
+
+ /* Range begins at the start of the entry */
+ if (log2_mod(pts->range->va, oasz_lg2))
+ return false;
+
+ /* Range ends past the end of the entry */
+ if (!log2_div_eq(range->va, range->last_va, oasz_lg2))
+ return true;
+
+ /* Range ends at the end of the entry */
+ return log2_mod_eq_max(range->last_va, oasz_lg2);
+}
+
+/**
+ * pt_range_to_index() - Starting index for an iteration
+ * @pts: Iteration State
+ *
+ * Return: the starting index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_index(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ PT_WARN_ON(pts->level > pts->range->top_level);
+ if (pts->range->top_level == pts->level)
+ return log2_div(fvalog2_mod(pts->range->va,
+ pts->range->max_vasz_lg2),
+ isz_lg2);
+ return log2_mod(log2_div(pts->range->va, isz_lg2),
+ pt_num_items_lg2(pts));
+}
+
+/**
+ * pt_range_to_end_index() - Ending index for an iteration
+ * @pts: Iteration State
+ *
+ * Return: the last index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_range *range = pts->range;
+ unsigned int num_entries_lg2;
+
+ if (range->va == range->last_va)
+ return pts->index + 1;
+
+ if (pts->range->top_level == pts->level)
+ return log2_div(fvalog2_mod(pts->range->last_va,
+ pts->range->max_vasz_lg2),
+ isz_lg2) +
+ 1;
+
+ num_entries_lg2 = pt_num_items_lg2(pts);
+
+ /* last_va falls within this table */
+ if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2))
+ return log2_mod(log2_div(pts->range->last_va, isz_lg2),
+ num_entries_lg2) +
+ 1;
+
+ return log2_to_int(num_entries_lg2);
+}
+
+static inline void _pt_iter_first(struct pt_state *pts)
+{
+ pts->index = pt_range_to_index(pts);
+ pts->end_index = pt_range_to_end_index(pts);
+ PT_WARN_ON(pts->index > pts->end_index);
+}
+
+static inline bool _pt_iter_load(struct pt_state *pts)
+{
+ if (pts->index >= pts->end_index)
+ return false;
+ pt_load_entry(pts);
+ return true;
+}
+
+/**
+ * pt_next_entry() - Advance pts to the next entry
+ * @pts: Iteration State
+ *
+ * Update pts to go to the next index at this level. If pts is pointing at a
+ * contiguous entry then the index may advance by more than one.
+ */
+static inline void pt_next_entry(struct pt_state *pts)
+{
+ if (pts->type == PT_ENTRY_OA &&
+ !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0))
+ _pt_advance(pts, pt_entry_num_contig_lg2(pts));
+ else
+ pts->index++;
+ pt_index_to_va(pts);
+}
+
+/**
+ * for_each_pt_level_entry() - For loop wrapper over entries in the range
+ * @pts: Iteration State
+ *
+ * This is the basic iteration primitive. It iterates over all the entries in
+ * pts->range that fall within the pts's current table level. Each step does
+ * pt_load_entry(pts).
+ */
+#define for_each_pt_level_entry(pts) \
+ for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts))
+
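+/*
+ * Illustrative usage sketch, not part of this patch: a recursive walker that
+ * counts leaf OA entries. count_oas and the count argument are hypothetical
+ * names; a real walker would normally be instantiated via PT_MAKE_LEVELS()
+ * further below.
+ *
+ * static int count_oas(struct pt_range *range, void *arg,
+ *                      unsigned int level, struct pt_table_p *table)
+ * {
+ *         struct pt_state pts = pt_init(range, level, table);
+ *         unsigned int *count = arg;
+ *         int ret;
+ *
+ *         for_each_pt_level_entry(&pts) {
+ *                 if (pts.type == PT_ENTRY_OA) {
+ *                         (*count)++;
+ *                 } else if (pts.type == PT_ENTRY_TABLE) {
+ *                         ret = pt_descend(&pts, arg, count_oas);
+ *                         if (ret)
+ *                                 return ret;
+ *                 }
+ *         }
+ *         return 0;
+ * }
+ */
+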
+/**
+ * pt_load_single_entry() - Version of pt_load_entry() usable within a walker
+ * @pts: Iteration State
+ *
+ * Alternative to for_each_pt_level_entry() if the walker function uses only a
+ * single entry.
+ */
+static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts)
+{
+ pts->index = pt_range_to_index(pts);
+ pt_load_entry(pts);
+ return pts->type;
+}
+
+static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
+ uintptr_t top_of_table)
+{
+ struct pt_range range = {
+ .common = common,
+ .top_table =
+ (struct pt_table_p *)(top_of_table &
+ ~(uintptr_t)PT_TOP_LEVEL_MASK),
+ .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),
+ };
+ struct pt_state pts = { .range = &range, .level = range.top_level };
+ unsigned int max_vasz_lg2;
+
+ max_vasz_lg2 = common->max_vasz_lg2;
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ pts.level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2,
+ pt_num_items_lg2(&pts) +
+ pt_table_item_lg2sz(&pts));
+
+ /*
+ * The top range will default to the lower region only with sign extend.
+ */
+ range.max_vasz_lg2 = max_vasz_lg2;
+ if (pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ max_vasz_lg2--;
+
+ range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2);
+ range.last_va =
+ fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2);
+ return range;
+}
+
+/**
+ * pt_top_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static __always_inline struct pt_range pt_top_range(struct pt_common *common)
+{
+ /*
+ * The top pointer can change without locking. We capture the value and
+ * its level here and are safe to walk it so long as both values are
+ * captured without tearing.
+ */
+ return _pt_top_range(common, READ_ONCE(common->top_of_table));
+}
+
+/**
+ * pt_all_range() - Return a range that spans the entire page table
+ * @common: Table
+ *
+ * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND
+ * is supported, range->va and range->last_va will be incorrect during the
+ * iteration and must not be accessed.
+ */
+static inline struct pt_range pt_all_range(struct pt_common *common)
+{
+ struct pt_range range = pt_top_range(common);
+
+ if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ return range;
+
+ /*
+ * Pretend the table is linear from 0 without a sign extension. This
+ * generates the correct indexes for iteration.
+ */
+ range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2);
+ return range;
+}
+
+/**
+ * pt_upper_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static inline struct pt_range pt_upper_range(struct pt_common *common)
+{
+ struct pt_range range = pt_top_range(common);
+
+ if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ return range;
+
+ range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1);
+ range.last_va = PT_VADDR_MAX;
+ return range;
+}
+
+/**
+ * pt_make_range() - Return a range that spans part of the table
+ * @common: Table
+ * @va: Start address
+ * @last_va: Last address
+ *
+ * The caller must validate the range with pt_check_range() before using it.
+ */
+static __always_inline struct pt_range
+pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va)
+{
+ struct pt_range range =
+ _pt_top_range(common, READ_ONCE(common->top_of_table));
+
+ range.va = va;
+ range.last_va = last_va;
+
+ return range;
+}
+
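+/*
+ * Typical usage sketch (illustrative; my_walker and my_arg are hypothetical
+ * caller-provided names): build the range, validate it, then walk it:
+ *
+ *         struct pt_range range = pt_make_range(common, iova, iova + len - 1);
+ *         int ret = pt_check_range(&range);
+ *
+ *         if (ret)
+ *                 return ret;
+ *         return pt_walk_range(&range, my_walker, &my_arg);
+ */
+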
+/*
+ * Span a slice of the table starting at a lower table level from an active
+ * walk.
+ */
+static __always_inline struct pt_range
+pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va,
+ pt_vaddr_t last_va)
+{
+ struct pt_range range = *parent;
+
+ range.va = va;
+ range.last_va = last_va;
+
+ PT_WARN_ON(last_va < va);
+ PT_WARN_ON(pt_check_range(&range));
+
+ return range;
+}
+
+/**
+ * pt_init() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ * @level: Table level for the state
+ * @table: Pointer to the table memory at level
+ *
+ * Helper to initialize the on-stack pt_state from walker arguments.
+ */
+static __always_inline struct pt_state
+pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = {
+ .range = range,
+ .table = table,
+ .level = level,
+ };
+ return pts;
+}
+
+/**
+ * pt_init_top() - Initialize a pt_state on the stack at the topmost level
+ * @range: Range pointer to embed in the state
+ *
+ * The pt_state points at the topmost level of the table.
+ */
+static __always_inline struct pt_state pt_init_top(struct pt_range *range)
+{
+ return pt_init(range, range->top_level, range->top_table);
+}
+
+typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table);
+
+/**
+ * pt_descend() - Recursively invoke the walker for the lower level
+ * @pts: Iteration State
+ * @arg: Value to pass to the function
+ * @fn: Walker function to call
+ *
+ * pts must point to a table item. Invoke fn as a walker on the table
+ * pts points to.
+ */
+static __always_inline int pt_descend(struct pt_state *pts, void *arg,
+ pt_level_fn_t fn)
+{
+ int ret;
+
+ if (PT_WARN_ON(!pts->table_lower))
+ return -EINVAL;
+
+ ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower);
+ return ret;
+}
+
+/**
+ * pt_walk_range() - Walk over a VA range
+ * @range: Range pointer
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * Walk over a VA range. The caller should have done a validity check, at
+ * least calling pt_check_range(), when building range. The walk will
+ * start at the topmost table.
+ */
+static __always_inline int pt_walk_range(struct pt_range *range,
+ pt_level_fn_t fn, void *arg)
+{
+ return fn(range, arg, range->top_level, range->top_table);
+}
+
+/**
+ * pt_walk_descend() - Recursively invoke the walker for a slice of a lower
+ * level
+ * @pts: Iteration State
+ * @va: Start address
+ * @last_va: Last address
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and walk over a slice of
+ * the lower table. The caller must ensure that va/last_va are within the table
+ * item. This creates a new walk and does not alter pts or pts->range.
+ */
+static __always_inline int pt_walk_descend(const struct pt_state *pts,
+ pt_vaddr_t va, pt_vaddr_t last_va,
+ pt_level_fn_t fn, void *arg)
+{
+ struct pt_range range = pt_make_child_range(pts->range, va, last_va);
+
+ if (PT_WARN_ON(!pt_can_have_table(pts)) ||
+ PT_WARN_ON(!pts->table_lower))
+ return -EINVAL;
+
+ return fn(&range, arg, pts->level - 1, pts->table_lower);
+}
+
+/**
+ * pt_walk_descend_all() - Recursively invoke the walker for a table item
+ * @parent_pts: Iteration State
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With parent_pts pointing at a table item this will descend and walk over the
+ * entire lower table. This creates a new walk and does not alter parent_pts or
+ * its range.
+ */
+static __always_inline int
+pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
+ void *arg)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts);
+
+ return pt_walk_descend(parent_pts,
+ log2_set_mod(parent_pts->range->va, 0, isz_lg2),
+ log2_set_mod_max(parent_pts->range->va, isz_lg2),
+ fn, arg);
+}
+
+/**
+ * pt_range_slice() - Return a range that spans indexes
+ * @pts: Iteration State
+ * @start_index: Starting index within pts
+ * @end_index: Ending index within pts
+ *
+ * Create a range that spans an index range of the current table level the
+ * pt_state points at.
+ */
+static inline struct pt_range pt_range_slice(const struct pt_state *pts,
+ unsigned int start_index,
+ unsigned int end_index)
+{
+ unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+ pt_vaddr_t last_va;
+ pt_vaddr_t va;
+
+ va = fvalog2_set_mod(pts->range->va,
+ log2_mul(start_index, pt_table_item_lg2sz(pts)),
+ table_lg2sz);
+ last_va = fvalog2_set_mod(
+ pts->range->va,
+ log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz);
+ return pt_make_child_range(pts->range, va, last_va);
+}
+
+/**
+ * pt_top_memsize_lg2() - Allocation size of the top table in lg2 bytes
+ * @common: Table
+ * @top_of_table: Top of table value from _pt_top_set()
+ *
+ * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this
+ * will compute the top size assuming the table will grow.
+ */
+static inline unsigned int pt_top_memsize_lg2(struct pt_common *common,
+ uintptr_t top_of_table)
+{
+ struct pt_range range = _pt_top_range(common, top_of_table);
+ struct pt_state pts = pt_init_top(&range);
+ unsigned int num_items_lg2;
+
+ num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts);
+ if (range.top_level != PT_MAX_TOP_LEVEL &&
+ pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts));
+
+ /* Round up the allocation size to the minimum alignment */
+ return max(ffs_t(u64, PT_TOP_PHYS_MASK),
+ num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE));
+}
+
+/**
+ * pt_compute_best_pgsize() - Determine the best page size for leaf entries
+ * @pgsz_bitmap: Permitted page sizes
+ * @va: Starting virtual address for the leaf entry
+ * @last_va: Last virtual address for the leaf entry, sets the max page size
+ * @oa: Starting output address for the leaf entry
+ *
+ * Compute the largest page size for va, last_va, and oa together and return it
+ * in lg2. The largest page size depends on the format's supported page sizes at
+ * this level, and the relative alignment of the VA and OA addresses. 0 means
+ * the OA cannot be stored with the provided pgsz_bitmap.
+ */
+static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
+ pt_vaddr_t va,
+ pt_vaddr_t last_va,
+ pt_oaddr_t oa)
+{
+ unsigned int best_pgsz_lg2;
+ unsigned int pgsz_lg2;
+ pt_vaddr_t len = last_va - va + 1;
+ pt_vaddr_t mask;
+
+ if (PT_WARN_ON(va >= last_va))
+ return 0;
+
+ /*
+ * Given a VA/OA pair the best page size is the largest page size
+ * where:
+ *
+ * 1) VA and OA start at the page. Bitwise this is the count of least
+ * significant 0 bits.
+ * This also implies that last_va/oa has the same prefix as va/oa.
+ */
+ mask = va | oa;
+
+ /*
+ * 2) The page size is not larger than the last_va (length). Since page
+ * sizes are always power of two this can't be larger than the
+ * largest power of two factor of the length.
+ */
+ mask |= log2_to_int(vafls(len) - 1);
+
+ best_pgsz_lg2 = vaffs(mask);
+
+ /* Choose the highest bit <= best_pgsz_lg2 */
+ if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1)
+ pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1);
+
+ pgsz_lg2 = vafls(pgsz_bitmap);
+ if (!pgsz_lg2)
+ return 0;
+
+ pgsz_lg2--;
+
+ PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0);
+ PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0);
+ PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va);
+ PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+ PT_WARN_ON(
+ !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+ return pgsz_lg2;
+}
+
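+/*
+ * Worked example (illustrative): va = 0x200000, last_va = 0x5fffff,
+ * oa = 0x400000, pgsz_bitmap = SZ_4K | SZ_2M. va | oa has 21 low zero bits
+ * and the length (0x400000) permits up to 2^22, so the best fit present in
+ * the bitmap is 2M and the function returns 21.
+ */
+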
+#define _PT_MAKE_CALL_LEVEL(fn) \
+ static __always_inline int fn(struct pt_range *range, void *arg, \
+ unsigned int level, \
+ struct pt_table_p *table) \
+ { \
+ static_assert(PT_MAX_TOP_LEVEL <= 5); \
+ if (level == 0) \
+ return CONCATENATE(fn, 0)(range, arg, 0, table); \
+ if (level == 1 || PT_MAX_TOP_LEVEL == 1) \
+ return CONCATENATE(fn, 1)(range, arg, 1, table); \
+ if (level == 2 || PT_MAX_TOP_LEVEL == 2) \
+ return CONCATENATE(fn, 2)(range, arg, 2, table); \
+ if (level == 3 || PT_MAX_TOP_LEVEL == 3) \
+ return CONCATENATE(fn, 3)(range, arg, 3, table); \
+ if (level == 4 || PT_MAX_TOP_LEVEL == 4) \
+ return CONCATENATE(fn, 4)(range, arg, 4, table); \
+ return CONCATENATE(fn, 5)(range, arg, 5, table); \
+ }
+
+static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg,
+ unsigned int unused_level,
+ struct pt_table_p *table)
+{
+ static_assert(PT_MAX_TOP_LEVEL <= 5);
+ return -EPROTOTYPE;
+}
+
+#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \
+ static inline int fn(struct pt_range *range, void *arg, \
+ unsigned int unused_level, \
+ struct pt_table_p *table) \
+ { \
+ return do_fn(range, arg, level, table, descend_fn); \
+ }
+
+/**
+ * PT_MAKE_LEVELS() - Build an unwound walker
+ * @fn: Name of the walker function
+ * @do_fn: Function to call at each level
+ *
+ * This builds a function call tree that can be fully inlined.
+ * The caller must provide a function body in an __always_inline function::
+ *
+ * static __always_inline int do_fn(struct pt_range *range, void *arg,
+ * unsigned int level, struct pt_table_p *table,
+ * pt_level_fn_t descend_fn)
+ *
+ * An inline function will be created for each table level that calls do_fn with
+ * a compile time constant for level and a pointer to the next lower function.
+ * This generates an optimally inlined walk where each of the functions sees a
+ * constant level and can codegen the exact constants/etc for that level.
+ *
+ * Note this can produce a lot of code!
+ */
+#define PT_MAKE_LEVELS(fn, do_fn) \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \
+ do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \
+ _PT_MAKE_CALL_LEVEL(fn)
+
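+/*
+ * Sketch of a do_fn suitable for PT_MAKE_LEVELS() (illustrative; my_walk and
+ * my_walk_fn are hypothetical names). Each generated level function calls
+ * this with a compile-time constant level:
+ *
+ * static __always_inline int my_walk(struct pt_range *range, void *arg,
+ *                                    unsigned int level,
+ *                                    struct pt_table_p *table,
+ *                                    pt_level_fn_t descend_fn)
+ * {
+ *         struct pt_state pts = pt_init(range, level, table);
+ *
+ *         for_each_pt_level_entry(&pts) {
+ *                 if (pts.type == PT_ENTRY_TABLE) {
+ *                         int ret = pt_descend(&pts, arg, descend_fn);
+ *
+ *                         if (ret)
+ *                                 return ret;
+ *                 }
+ *                 (handle PT_ENTRY_OA / PT_ENTRY_EMPTY here)
+ *         }
+ *         return 0;
+ * }
+ * PT_MAKE_LEVELS(my_walk_fn, my_walk);
+ */
+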
+#endif
diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h
new file mode 100644
index 000000000000..6dbbed119238
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_log2.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Helper macros for working with log2 values
+ *
+ */
+#ifndef __GENERIC_PT_LOG2_H
+#define __GENERIC_PT_LOG2_H
+#include <linux/bitops.h>
+#include <linux/limits.h>
+
+/* Compute a */
+#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2)))
+static_assert(log2_to_int_t(unsigned int, 0) == 1);
+
+/* Compute a - 1 (aka all low bits set) */
+#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1))
+
+/* Compute a / b */
+#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2)))
+static_assert(log2_div_t(unsigned int, 4, 2) == 1);
+
+/*
+ * Compute:
+ * a / c == b / c
+ * aka the high bits are equal
+ */
+#define log2_div_eq_t(type, a, b, c_lg2) \
+ (log2_div_t(type, (a) ^ (b), c_lg2) == 0)
+static_assert(log2_div_eq_t(unsigned int, 1, 1, 2));
+
+/* Compute a % b */
+#define log2_mod_t(type, a, b_lg2) \
+ ((type)(((type)a) & log2_to_max_int_t(type, b_lg2)))
+static_assert(log2_mod_t(unsigned int, 1, 2) == 1);
+
+/*
+ * Compute:
+ * a % b == b - 1
+ * aka the low bits are all 1s
+ */
+#define log2_mod_eq_max_t(type, a, b_lg2) \
+ (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2))
+static_assert(log2_mod_eq_max_t(unsigned int, 3, 2));
+
+/*
+ * Return a value such that:
+ * a / b == ret / b
+ * ret % b == val
+ * aka set the low bits to val. val must be < b
+ */
+#define log2_set_mod_t(type, a, val, b_lg2) \
+ ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val)))
+static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1);
+
+/*
+ * Return a value such that:
+ * a / b == ret / b
+ * ret % b == b - 1
+ * aka set the low bits to all 1s
+ */
+#define log2_set_mod_max_t(type, a, b_lg2) \
+ (((type)(a)) | log2_to_max_int_t(type, b_lg2))
+static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3);
+
+/* Compute a * b */
+#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2)))
+static_assert(log2_mul_t(unsigned int, 2, 2) == 8);
+
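+/*
+ * Illustrative composition (not part of the patch): rounding an address to a
+ * power of two block expressed as lg2:
+ *  log2_set_mod_t(u64, addr, 0, blk_lg2)  rounds addr down to the block start
+ *  log2_set_mod_max_t(u64, addr, blk_lg2) gives the last byte of the block
+ */
+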
+#define _dispatch_sz(type, fn, a) \
+ (sizeof(type) == 4 ? fn##32((u32)a) : fn##64(a))
+
+/*
+ * Return the highest value such that:
+ * fls_t(u32, 0) == 0
+ * fls_t(u3, 1) == 1
+ * a >= log2_to_int(ret - 1)
+ * aka find last set bit
+ */
+static inline unsigned int fls32(u32 a)
+{
+ return fls(a);
+}
+#define fls_t(type, a) _dispatch_sz(type, fls, a)
+
+/*
+ * Return the highest value such that:
+ * ffs_t(u32, 0) == UNDEFINED
+ * ffs_t(u32, 1) == 0
+ * log2_mod(a, ret) == 0
+ * aka find first set bit
+ */
+static inline unsigned int __ffs32(u32 a)
+{
+ return __ffs(a);
+}
+#define ffs_t(type, a) _dispatch_sz(type, __ffs, a)
+
+/*
+ * Return the highest value such that:
+ * ffz_t(u32, U32_MAX) == UNDEFINED
+ * ffz_t(u32, 0) == 0
+ * ffz_t(u32, 1) == 1
+ * log2_mod(a, ret) == log2_to_max_int(ret)
+ * aka find first zero bit
+ */
+static inline unsigned int ffz32(u32 a)
+{
+ return ffz(a);
+}
+static inline unsigned int ffz64(u64 a)
+{
+ if (sizeof(u64) == sizeof(unsigned long))
+ return ffz(a);
+
+ if ((u32)a == U32_MAX)
+ return ffz32(a >> 32) + 32;
+ return ffz32(a);
+}
+#define ffz_t(type, a) _dispatch_sz(type, ffz, a)
+
+#endif
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index f2f538c70650..5471f814e073 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -13,6 +13,10 @@ config INTEL_IOMMU
bool "Support for Intel IOMMU using DMA Remapping Devices"
depends on PCI_MSI && ACPI && X86
select IOMMU_API
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_X86_64
+ select IOMMU_PT_VTDSS
select IOMMU_IOVA
select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
@@ -66,7 +70,7 @@ config INTEL_IOMMU_DEFAULT_ON
config INTEL_IOMMU_FLOPPY_WA
def_bool y
- depends on X86
+ depends on X86 && BLK_DEV_FD
help
Floppy disk drivers are known to bypass DMA API calls
thereby failing to work when IOMMU is enabled. This
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e236c7ec221f..4e888867e85c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -45,16 +45,9 @@
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
-#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
-#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
-
-/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
- to match. That way, we can use 'unsigned long' for PFNs with impunity. */
-#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
- __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
-#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-
static void __init check_tylersburg_isoch(void);
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
static int rwbf_quirk;
#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap))
@@ -217,7 +210,6 @@ static int disable_igfx_iommu;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
-static const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
@@ -285,13 +277,6 @@ static int __init intel_iommu_setup(char *str)
}
__setup("intel_iommu=", intel_iommu_setup);
-static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
-{
- int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-
- return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
-}
-
/*
* Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
@@ -353,23 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}
-/* Return the super pagesize bitmap if supported. */
-static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
-{
- unsigned long bitmap = 0;
-
- /*
- * 1-level super page supports page size of 2MiB, 2-level super page
- * supports page size of both 2MiB and 1GiB.
- */
- if (domain->iommu_superpage == 1)
- bitmap |= SZ_2M;
- else if (domain->iommu_superpage == 2)
- bitmap |= SZ_2M | SZ_1G;
-
- return bitmap;
-}
-
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
u8 devfn, int alloc)
{
@@ -556,13 +524,6 @@ out:
return iommu;
}
-static void domain_flush_cache(struct dmar_domain *domain,
- void *addr, int size)
-{
- if (!domain->iommu_coherency)
- clflush_cache_range(addr, size);
-}
-
static void free_context_table(struct intel_iommu *iommu)
{
struct context_entry *context;
@@ -707,280 +668,6 @@ pgtable_walk:
}
#endif
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
- unsigned long pfn, int *target_level,
- gfp_t gfp)
-{
- struct dma_pte *parent, *pte;
- int level = agaw_to_level(domain->agaw);
- int offset;
-
- if (!domain_pfn_supported(domain, pfn))
- /* Address beyond IOMMU's addressing capabilities. */
- return NULL;
-
- parent = domain->pgd;
-
- while (1) {
- void *tmp_page;
-
- offset = pfn_level_offset(pfn, level);
- pte = &parent[offset];
- if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
- break;
- if (level == *target_level)
- break;
-
- if (!dma_pte_present(pte)) {
- uint64_t pteval, tmp;
-
- tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
- SZ_4K);
-
- if (!tmp_page)
- return NULL;
-
- domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
- pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
- DMA_PTE_WRITE;
- if (domain->use_first_level)
- pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
-
- tmp = 0ULL;
- if (!try_cmpxchg64(&pte->val, &tmp, pteval))
- /* Someone else set it while we were thinking; use theirs. */
- iommu_free_pages(tmp_page);
- else
- domain_flush_cache(domain, pte, sizeof(*pte));
- }
- if (level == 1)
- break;
-
- parent = phys_to_virt(dma_pte_addr(pte));
- level--;
- }
-
- if (!*target_level)
- *target_level = level;
-
- return pte;
-}
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
- unsigned long pfn,
- int level, int *large_page)
-{
- struct dma_pte *parent, *pte;
- int total = agaw_to_level(domain->agaw);
- int offset;
-
- parent = domain->pgd;
- while (level <= total) {
- offset = pfn_level_offset(pfn, total);
- pte = &parent[offset];
- if (level == total)
- return pte;
-
- if (!dma_pte_present(pte)) {
- *large_page = total;
- break;
- }
-
- if (dma_pte_superpage(pte)) {
- *large_page = total;
- return pte;
- }
-
- parent = phys_to_virt(dma_pte_addr(pte));
- total--;
- }
- return NULL;
-}
-
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned int large_page;
- struct dma_pte *first_pte, *pte;
-
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- do {
- large_page = 1;
- first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
- if (!pte) {
- start_pfn = align_to_level(start_pfn + 1, large_page + 1);
- continue;
- }
- do {
- dma_clear_pte(pte);
- start_pfn += lvl_to_nr_pages(large_page);
- pte++;
- } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
-
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
-
- } while (start_pfn && start_pfn <= last_pfn);
-}
-
-static void dma_pte_free_level(struct dmar_domain *domain, int level,
- int retain_level, struct dma_pte *pte,
- unsigned long pfn, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
- struct dma_pte *level_pte;
-
- if (!dma_pte_present(pte) || dma_pte_superpage(pte))
- goto next;
-
- level_pfn = pfn & level_mask(level);
- level_pte = phys_to_virt(dma_pte_addr(pte));
-
- if (level > 2) {
- dma_pte_free_level(domain, level - 1, retain_level,
- level_pte, level_pfn, start_pfn,
- last_pfn);
- }
-
- /*
- * Free the page table if we're below the level we want to
- * retain and the range covers the entire table.
- */
- if (level < retain_level && !(start_pfn > level_pfn ||
- last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
- domain_flush_cache(domain, pte, sizeof(*pte));
- iommu_free_pages(level_pte);
- }
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-}
-
-/*
- * clear last level (leaf) ptes and free page table pages below the
- * level we wish to keep intact.
- */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn,
- int retain_level)
-{
- dma_pte_clear_range(domain, start_pfn, last_pfn);
-
- /* We don't need lock here; nobody else touches the iova range */
- dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
- domain->pgd, 0, start_pfn, last_pfn);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_free_pages(domain->pgd);
- domain->pgd = NULL;
- }
-}
-
-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
-static void dma_pte_list_pagetables(struct dmar_domain *domain,
- int level, struct dma_pte *parent_pte,
- struct iommu_pages_list *freelist)
-{
- struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
-
- iommu_pages_list_add(freelist, pte);
-
- if (level == 1)
- return;
-
- do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
- pte++;
- } while (!first_pte_in_page(pte));
-}
-
-static void dma_pte_clear_level(struct dmar_domain *domain, int level,
- struct dma_pte *pte, unsigned long pfn,
- unsigned long start_pfn, unsigned long last_pfn,
- struct iommu_pages_list *freelist)
-{
- struct dma_pte *first_pte = NULL, *last_pte = NULL;
-
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn = pfn & level_mask(level);
-
- if (!dma_pte_present(pte))
- goto next;
-
- /* If range covers entire pagetable, free it */
- if (start_pfn <= level_pfn &&
- last_pfn >= level_pfn + level_size(level) - 1) {
- /* These suborbinate page tables are going away entirely. Don't
- bother to clear them; we're just going to *free* them. */
- if (level > 1 && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-
- dma_clear_pte(pte);
- if (!first_pte)
- first_pte = pte;
- last_pte = pte;
- } else if (level > 1) {
- /* Recurse down into a level that isn't *entirely* obsolete */
- dma_pte_clear_level(domain, level - 1,
- phys_to_virt(dma_pte_addr(pte)),
- level_pfn, start_pfn, last_pfn,
- freelist);
- }
-next:
- pfn = level_pfn + level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-
- if (first_pte)
- domain_flush_cache(domain, first_pte,
- (void *)++last_pte - (void *)first_pte);
-}
-
-/* We can't just free the pages because the IOMMU may still be walking
- the page tables, and may have cached the intermediate levels. The
- pages can only be freed after the IOTLB flush has been done. */
-static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
- unsigned long last_pfn,
- struct iommu_pages_list *freelist)
-{
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
- domain->pgd, 0, start_pfn, last_pfn, freelist);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_pages_list_add(freelist, domain->pgd);
- domain->pgd = NULL;
- }
-}
-
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
@@ -1460,13 +1147,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
domain_lookup_dev_info(domain, iommu, bus, devfn);
u16 did = domain_id_iommu(domain, iommu);
int translation = CONTEXT_TT_MULTI_LEVEL;
- struct dma_pte *pgd = domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
struct context_entry *context;
int ret;
if (WARN_ON(!intel_domain_is_ss_paging(domain)))
return -EINVAL;
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+
pr_debug("Set context mapping for %02x:%02x.%d\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1489,8 +1178,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
else
translation = CONTEXT_TT_MULTI_LEVEL;
- context_set_address_root(context, virt_to_phys(pgd));
- context_set_address_width(context, domain->agaw);
+ context_set_address_root(context, pt_info.ssptptr);
+ context_set_address_width(context, pt_info.aw);
context_set_translation_type(context, translation);
context_set_fault_enable(context);
context_set_present(context);
@@ -1537,177 +1226,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev)
return 0;
}
-/* Return largest possible superpage level for a given mapping */
-static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phy_pfn, unsigned long pages)
-{
- int support, level = 1;
- unsigned long pfnmerge;
-
- support = domain->iommu_superpage;
-
- /* To use a large page, the virtual *and* physical addresses
- must be aligned to 2MiB/1GiB/etc. Lower bits set in either
- of them will mean we have to use smaller pages. So just
- merge them and check both at once. */
- pfnmerge = iov_pfn | phy_pfn;
-
- while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
- pages >>= VTD_STRIDE_SHIFT;
- if (!pages)
- break;
- pfnmerge >>= VTD_STRIDE_SHIFT;
- level++;
- support--;
- }
- return level;
-}
-
-/*
- * Ensure that old small page tables are removed to make room for superpage(s).
- * We're going to add new large pages, so make sure we don't remove their parent
- * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
- */
-static void switch_to_super_page(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long end_pfn, int level)
-{
- unsigned long lvl_pages = lvl_to_nr_pages(level);
- struct dma_pte *pte = NULL;
-
- if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
- !IS_ALIGNED(end_pfn + 1, lvl_pages)))
- return;
-
- while (start_pfn <= end_pfn) {
- if (!pte)
- pte = pfn_to_dma_pte(domain, start_pfn, &level,
- GFP_ATOMIC);
-
- if (dma_pte_present(pte)) {
- dma_pte_free_pagetable(domain, start_pfn,
- start_pfn + lvl_pages - 1,
- level + 1);
-
- cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
- end_pfn << VTD_PAGE_SHIFT, 0);
- }
-
- pte++;
- start_pfn += lvl_pages;
- if (first_pte_in_page(pte))
- pte = NULL;
- }
-}
-
-static int
-__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phys_pfn, unsigned long nr_pages, int prot,
- gfp_t gfp)
-{
- struct dma_pte *first_pte = NULL, *pte = NULL;
- unsigned int largepage_lvl = 0;
- unsigned long lvl_pages = 0;
- phys_addr_t pteval;
- u64 attr;
-
- if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
- return -EINVAL;
-
- if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
- return -EINVAL;
-
- if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
- pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
- return -EINVAL;
- }
-
- attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
- if (domain->use_first_level) {
- attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
- if (prot & DMA_PTE_WRITE)
- attr |= DMA_FL_PTE_DIRTY;
- }
-
- domain->has_mappings = true;
-
- pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
-
- while (nr_pages > 0) {
- uint64_t tmp;
-
- if (!pte) {
- largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
- phys_pfn, nr_pages);
-
- pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
- gfp);
- if (!pte)
- return -ENOMEM;
- first_pte = pte;
-
- lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
- /* It is large page*/
- if (largepage_lvl > 1) {
- unsigned long end_pfn;
- unsigned long pages_to_remove;
-
- pteval |= DMA_PTE_LARGE_PAGE;
- pages_to_remove = min_t(unsigned long,
- round_down(nr_pages, lvl_pages),
- nr_pte_to_next_page(pte) * lvl_pages);
- end_pfn = iov_pfn + pages_to_remove - 1;
- switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
- } else {
- pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
- }
-
- }
- /* We don't need lock here, nobody else
- * touches the iova range
- */
- tmp = 0ULL;
- if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
- static int dumps = 5;
- pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
- iov_pfn, tmp, (unsigned long long)pteval);
- if (dumps) {
- dumps--;
- debug_dma_dump_mappings(NULL);
- }
- WARN_ON(1);
- }
-
- nr_pages -= lvl_pages;
- iov_pfn += lvl_pages;
- phys_pfn += lvl_pages;
- pteval += lvl_pages * VTD_PAGE_SIZE;
-
- /* If the next PTE would be the first in a new page, then we
- * need to flush the cache on the entries we've just written.
- * And then we'll need to recalculate 'pte', so clear it and
- * let it get set again in the if (!pte) block above.
- *
- * If we're done (!nr_pages) we need to flush the cache too.
- *
- * Also if we've been setting superpages, we may need to
- * recalculate 'pte' and switch back to smaller pages for the
- * end of the mapping, if the trailing size is not enough to
- * use another superpage (i.e. nr_pages < lvl_pages).
- */
- pte++;
- if (!nr_pages || first_pte_in_page(pte) ||
- (largepage_lvl > 1 && nr_pages < lvl_pages)) {
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
- pte = NULL;
- }
- }
-
- return 0;
-}
-
static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
{
struct intel_iommu *iommu = info->iommu;
@@ -1769,22 +1287,26 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
struct device *dev,
u32 pasid, struct iommu_domain *old)
{
- struct dma_pte *pgd = domain->pgd;
- int level, flags = 0;
+ struct pt_iommu_x86_64_hw_info pt_info;
+ unsigned int flags = 0;
- level = agaw_to_level(domain->agaw);
- if (level != 4 && level != 5)
+ pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info);
+ if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5))
return -EINVAL;
- if (level == 5)
+ if (pt_info.levels == 5)
flags |= PASID_FLAG_FL5LP;
if (domain->force_snooping)
flags |= PASID_FLAG_PAGE_SNOOP;
+ if (!(domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ flags |= PASID_FLAG_PWSNP;
+
return __domain_setup_first_level(iommu, dev, pasid,
domain_id_iommu(domain, iommu),
- __pa(pgd), flags, old);
+ pt_info.gcr3_pt, flags, old);
}
static int dmar_domain_attach_device(struct dmar_domain *domain,
@@ -3230,7 +2752,8 @@ void device_block_translation(struct device *dev)
}
static int blocking_domain_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
@@ -3251,23 +2774,9 @@ static struct iommu_domain blocking_domain = {
}
};
-static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
-{
- if (!intel_iommu_superpage)
- return 0;
-
- if (first_stage)
- return cap_fl1gp_support(iommu->cap) ? 2 : 1;
-
- return fls(cap_super_page_val(iommu->cap));
-}
-
-static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
+static struct dmar_domain *paging_domain_alloc(void)
{
- struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct intel_iommu *iommu = info->iommu;
struct dmar_domain *domain;
- int addr_width;
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
@@ -3282,56 +2791,38 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st
INIT_LIST_HEAD(&domain->s1_domains);
spin_lock_init(&domain->s1_lock);
- domain->nid = dev_to_node(dev);
- domain->use_first_level = first_stage;
-
- domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
-
- /* calculate the address width */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
- domain->gaw = addr_width;
- domain->agaw = iommu->agaw;
- domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
-
- /* iommu memory access coherency */
- domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
+ return domain;
+}
- /* pagesize bitmap */
- domain->domain.pgsize_bitmap = SZ_4K;
- domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
- domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
+static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu,
+ unsigned int *top_level)
+{
+ unsigned int mgaw = cap_mgaw(iommu->cap);
/*
- * IOVA aperture: First-level translation restricts the input-address
- * to a canonical address (i.e., address bits 63:N have the same value
- * as address bit [N-1], where N is 48-bits with 4-level paging and
- * 57-bits with 5-level paging). Hence, skip bit [N-1].
+ * Spec 3.6 First-Stage Translation:
+ *
+ * Software must limit addresses to less than the minimum of MGAW
+ * and the lower canonical address width implied by FSPM (i.e.,
+ * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
*/
- domain->domain.geometry.force_aperture = true;
- domain->domain.geometry.aperture_start = 0;
- if (first_stage)
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
- else
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
-
- /* always allocate the top pgd */
- domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
- if (!domain->pgd) {
- kfree(domain);
- return ERR_PTR(-ENOMEM);
+ if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) {
+ *top_level = 4;
+ return min(57, mgaw);
}
- domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
- return domain;
+ /* Four level is always supported */
+ *top_level = 3;
+ return min(48, mgaw);
}
static struct iommu_domain *
intel_iommu_domain_alloc_first_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_x86_64_cfg cfg = {};
struct dmar_domain *dmar_domain;
+ int ret;
if (flags & ~IOMMU_HWPT_ALLOC_PASID)
return ERR_PTR(-EOPNOTSUPP);
@@ -3340,10 +2831,20 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
- dmar_domain = paging_domain_alloc(dev, true);
+ dmar_domain = paging_domain_alloc();
if (IS_ERR(dmar_domain))
return ERR_CAST(dmar_domain);
+ cfg.common.hw_max_vasz_lg2 =
+ compute_vasz_lg2_fs(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
+ BIT(PT_FEAT_FLUSH_RANGE);
+ /* First stage always uses scalable mode */
+ if (!ecap_smpwc(iommu->ecap))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
/*
* iotlb sync for map is only needed for legacy implementations that
@@ -3353,14 +2854,58 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
if (rwbf_required(iommu))
dmar_domain->iotlb_sync_map = true;
+ ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ if (!cap_fl1gp_support(iommu->cap))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
+
return &dmar_domain->domain;
}
+static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu,
+ unsigned int *top_level)
+{
+ unsigned int sagaw = cap_sagaw(iommu->cap);
+ unsigned int mgaw = cap_mgaw(iommu->cap);
+
+ /*
+ * Find the largest table size that both the mgaw and sagaw support.
+ * This sets the valid range of IOVA and the top starting level.
+ * Some HW may only support a 4 or 5 level walk but must limit IOVA to
+ * 3 levels.
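+ *
+ * For example (illustrative): sagaw = BIT(3) with mgaw = 48 means the HW
+ * can only do a 5-level walk, so this returns top_level = 4 while limiting
+ * the IOVA space to 48 bits.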
+ */
+ if (mgaw > 48 && sagaw >= BIT(3)) {
+ *top_level = 4;
+ return min(57, mgaw);
+ } else if (mgaw > 39 && sagaw >= BIT(2)) {
+ *top_level = 3 + ffs(sagaw >> 3);
+ return min(48, mgaw);
+ } else if (mgaw > 30 && sagaw >= BIT(1)) {
+ *top_level = 2 + ffs(sagaw >> 2);
+ return min(39, mgaw);
+ }
+ return 0;
+}
+
+static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(vtdss),
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+};
+
static struct iommu_domain *
intel_iommu_domain_alloc_second_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_vtdss_cfg cfg = {};
struct dmar_domain *dmar_domain;
+ unsigned int sslps;
+ int ret;
if (flags &
(~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
@@ -3377,15 +2922,46 @@ intel_iommu_domain_alloc_second_stage(struct device *dev,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
- dmar_domain = paging_domain_alloc(dev, false);
+ dmar_domain = paging_domain_alloc();
if (IS_ERR(dmar_domain))
return ERR_CAST(dmar_domain);
+ cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
+
+ /*
+ * Read-only mapping is disallowed on the domain which serves as the
+ * parent in a nested configuration, due to HW errata
+ * (ERRATA_772415_SPR17)
+ */
+ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)
+ cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);
+
+ if (!iommu_paging_structure_coherency(iommu))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
- dmar_domain->domain.dirty_ops = &intel_dirty_ops;
+ dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops;
+
+ ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ /* Adjust the supported page sizes to HW capability */
+ sslps = cap_super_page_val(iommu->cap);
+ if (!(sslps & BIT(0)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M;
+ if (!(sslps & BIT(1)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
/*
* Besides the internal write buffer flush, the caching mode used for
@@ -3427,14 +3003,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain)
if (WARN_ON(!list_empty(&dmar_domain->devices)))
return;
- if (dmar_domain->pgd) {
- struct iommu_pages_list freelist =
- IOMMU_PAGES_LIST_INIT(freelist);
-
- domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw),
- &freelist);
- iommu_put_pages_list(&freelist);
- }
+ pt_iommu_deinit(&dmar_domain->iommu);
kfree(dmar_domain->qi_batch);
kfree(dmar_domain);
@@ -3451,6 +3020,16 @@ static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return -EINVAL;
+ if (!ecap_smpwc(iommu->ecap) &&
+ !(dmar_domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
+
+ /* Supports the number of table levels */
+ if (!cap_fl5lp_support(iommu->cap) &&
+ dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48)
+ return -EINVAL;
+
/* Same page size support */
if (!cap_fl1gp_support(iommu->cap) &&
(dmar_domain->domain.pgsize_bitmap & SZ_1G))
@@ -3467,7 +3046,11 @@ static int
paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
struct intel_iommu *iommu)
{
+ unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2;
unsigned int sslps = cap_super_page_val(iommu->cap);
+ struct pt_iommu_vtdss_hw_info pt_info;
+
+ pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info);
if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
return -EINVAL;
@@ -3478,6 +3061,19 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return -EINVAL;
+ if (!iommu_paging_structure_coherency(iommu) &&
+ !(dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
+
+ /* Address width falls within the capability */
+ if (cap_mgaw(iommu->cap) < vasz_lg2)
+ return -EINVAL;
+
+ /* Page table level is supported. */
+ if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
+ return -EINVAL;
+
/* Same page size support */
if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
return -EINVAL;
@@ -3489,6 +3085,14 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
!dmar_domain->iotlb_sync_map)
return -EINVAL;
+ /*
+ * FIXME this is locked wrong, it needs to be under the
+ * dmar_domain->lock
+ */
+ if ((dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) &&
+ !ecap_sc_support(iommu->ecap))
+ return -EINVAL;
return 0;
}
@@ -3498,7 +3102,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu = info->iommu;
int ret = -EINVAL;
- int addr_width;
if (intel_domain_is_fs_paging(dmar_domain))
ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
@@ -3509,26 +3112,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
if (ret)
return ret;
- /*
- * FIXME this is locked wrong, it needs to be under the
- * dmar_domain->lock
- */
- if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
- return -EINVAL;
-
- if (dmar_domain->iommu_coherency !=
- iommu_paging_structure_coherency(iommu))
- return -EINVAL;
-
-
- /* check if this iommu agaw is sufficient for max mapped address */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
-
- if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
- return -EINVAL;
-
if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
context_copied(iommu, info->bus, info->devfn))
return intel_pasid_setup_sm_context(dev);
@@ -3537,7 +3120,8 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret;
@@ -3558,110 +3142,6 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
return ret;
}
-static int intel_iommu_map(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot, gfp_t gfp)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- u64 max_addr;
- int prot = 0;
-
- if (iommu_prot & IOMMU_READ)
- prot |= DMA_PTE_READ;
- if (iommu_prot & IOMMU_WRITE)
- prot |= DMA_PTE_WRITE;
- if (dmar_domain->set_pte_snp)
- prot |= DMA_PTE_SNP;
-
- max_addr = iova + size;
- if (dmar_domain->max_addr < max_addr) {
- u64 end;
-
- /* check if minimum agaw is sufficient for mapped address */
- end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
- if (end < max_addr) {
- pr_err("%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, dmar_domain->gaw, max_addr);
- return -EFAULT;
- }
- dmar_domain->max_addr = max_addr;
- }
- /* Round up size to next multiple of PAGE_SIZE, if it and
- the low bits of hpa would take us onto the next page */
- size = aligned_nrpages(hpa, size);
- return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
- hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
-}
-
-static int intel_iommu_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
- int ret;
-
- if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
- return -EINVAL;
-
- if (!IS_ALIGNED(iova | paddr, pgsize))
- return -EINVAL;
-
- ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
- if (!ret && mapped)
- *mapped = size;
-
- return ret;
-}
-
-static size_t intel_iommu_unmap(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- struct iommu_iotlb_gather *gather)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long start_pfn, last_pfn;
- int level = 0;
-
- /* Cope with horrid API which requires us to unmap more than the
- size argument if it happens to be a large-page mapping. */
- if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
- &level, GFP_ATOMIC)))
- return 0;
-
- if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
- size = VTD_PAGE_SIZE << level_to_offset_bits(level);
-
- start_pfn = iova >> VTD_PAGE_SHIFT;
- last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
-
- domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
-
- if (dmar_domain->max_addr == iova + size)
- dmar_domain->max_addr = iova;
-
- /*
- * We do not use page-selective IOTLB invalidation in flush queue,
- * so there is no need to track page and sync iotlb.
- */
- if (!iommu_iotlb_gather_queued(gather))
- iommu_iotlb_gather_add_page(domain, gather, iova, size);
-
- return size;
-}
-
-static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
-
- return intel_iommu_unmap(domain, iova, size, gather);
-}
-
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
@@ -3671,24 +3151,6 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain,
iommu_put_pages_list(&gather->freelist);
}
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct dma_pte *pte;
- int level = 0;
- u64 phys = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
- GFP_ATOMIC);
- if (pte && dma_pte_present(pte))
- phys = dma_pte_addr(pte) +
- (iova & (BIT_MASK(level_to_offset_bits(level) +
- VTD_PAGE_SHIFT) - 1));
-
- return phys;
-}
-
static bool domain_support_force_snooping(struct dmar_domain *domain)
{
struct device_domain_info *info;
@@ -3730,15 +3192,15 @@ static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
guard(spinlock_irqsave)(&dmar_domain->lock);
- if (!domain_support_force_snooping(dmar_domain) ||
- dmar_domain->has_mappings)
+ if (!domain_support_force_snooping(dmar_domain))
return false;
/*
* Second level page table supports per-PTE snoop control. The
* iommu_map() interface will handle this by setting SNP bit.
*/
- dmar_domain->set_pte_snp = true;
+ dmar_domain->sspt.vtdss_pt.common.features |=
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);
dmar_domain->force_snooping = true;
return true;
}
@@ -4302,49 +3764,6 @@ err_unwind:
return ret;
}
-static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long end = iova + size - 1;
- unsigned long pgsize;
-
- /*
- * IOMMUFD core calls into a dirty tracking disabled domain without an
- * IOVA bitmap set in order to clean dirty bits in all PTEs that might
- * have occurred when we stopped dirty tracking. This ensures that we
- * never inherit dirtied bits from a previous cycle.
- */
- if (!dmar_domain->dirty_tracking && dirty->bitmap)
- return -EINVAL;
-
- do {
- struct dma_pte *pte;
- int lvl = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
- GFP_ATOMIC);
- pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
- if (!pte || !dma_pte_present(pte)) {
- iova += pgsize;
- continue;
- }
-
- if (dma_sl_pte_test_and_clear_dirty(pte, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-static const struct iommu_dirty_ops intel_dirty_ops = {
- .set_dirty_tracking = intel_iommu_set_dirty_tracking,
- .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
-};
-
static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
@@ -4401,7 +3820,9 @@ static int device_setup_pass_through(struct device *dev)
context_setup_pass_through_cb, dev);
}
-static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int identity_domain_attach_dev(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
@@ -4462,27 +3883,23 @@ static struct iommu_domain identity_domain = {
};
const struct iommu_domain_ops intel_fs_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
.free = intel_iommu_domain_free,
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
};
const struct iommu_domain_ops intel_ss_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(vtdss),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
.free = intel_iommu_domain_free,
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
};
@@ -4797,3 +4214,5 @@ err:
return ret;
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 3056583d7f56..25c5e22096d4 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -23,8 +23,8 @@
#include <linux/xarray.h>
#include <linux/perf_event.h>
#include <linux/pci.h>
+#include <linux/generic_pt/iommu.h>
-#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <uapi/linux/iommufd.h>
@@ -595,22 +595,20 @@ struct qi_batch {
};
struct dmar_domain {
- int nid; /* node id */
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ /* First stage page table */
+ struct pt_iommu_x86_64 fspt;
+ /* Second stage page table */
+ struct pt_iommu_vtdss sspt;
+ };
+
struct xarray iommu_array; /* Attached IOMMU array */
- u8 iommu_coherency: 1; /* indicate coherency of iommu access */
- u8 force_snooping : 1; /* Create IOPTEs with snoop control */
- u8 set_pte_snp:1;
- u8 use_first_level:1; /* DMA translation for the domain goes
- * through the first level page table,
- * otherwise, goes through the second
- * level.
- */
+ u8 force_snooping:1; /* Create PASID entry with snoop control */
u8 dirty_tracking:1; /* Dirty tracking is enabled */
u8 nested_parent:1; /* Has other domains nested on it */
- u8 has_mappings:1; /* Has mappings configured through
- * iommu_map() interface.
- */
u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write
* buffer when creating mappings.
*/
@@ -623,26 +621,9 @@ struct dmar_domain {
struct list_head cache_tags; /* Cache tag list */
struct qi_batch *qi_batch; /* Batched QI descriptors */
- int iommu_superpage;/* Level of superpages supported:
- 0 == 4KiB (no superpages), 1 == 2MiB,
- 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
union {
/* DMA remapping domain */
struct {
- /* virtual address */
- struct dma_pte *pgd;
- /* max guest address width */
- int gaw;
- /*
- * adjusted guest address width:
- * 0: level 2 30-bit
- * 1: level 3 39-bit
- * 2: level 4 48-bit
- * 3: level 5 57-bit
- */
- int agaw;
- /* maximum mapped address */
- u64 max_addr;
/* Protect the s1_domains list */
spinlock_t s1_lock;
/* Track s1_domains nested on this domain */
@@ -664,10 +645,10 @@ struct dmar_domain {
struct mmu_notifier notifier;
};
};
-
- struct iommu_domain domain; /* generic domain data structure for
- iommu core */
};
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain);
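The new union only works if struct iommu_domain sits at the same offset through every member, which is what the PT_IOMMU_CHECK_DOMAIN() lines appear to assert at compile time. The underlying idiom, with hypothetical names since the real macro is not shown in this diff:

	/* Idiom sketch; assumes pt_iommu embeds a member named "domain" */
	#define EXAMPLE_CHECK_DOMAIN(drv_struct, pt_member, dom_member)   \
		static_assert(offsetof(drv_struct, pt_member.domain) ==   \
			      offsetof(drv_struct, dom_member))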
/*
* In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
@@ -866,11 +847,6 @@ struct dma_pte {
u64 val;
};
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
- pte->val = 0;
-}
-
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
@@ -886,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
-static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
- unsigned long flags)
-{
- if (flags & IOMMU_DIRTY_NO_CLEAR)
- return (pte->val & DMA_SL_PTE_DIRTY) != 0;
-
- return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
- (unsigned long *)&pte->val);
-}
-
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
}
-static inline bool first_pte_in_page(struct dma_pte *pte)
-{
- return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
-}
-
-static inline int nr_pte_to_next_page(struct dma_pte *pte)
-{
- return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
- (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
-}
-
static inline bool context_present(struct context_entry *context)
{
return (context->lo & 1);
@@ -927,11 +882,6 @@ static inline int agaw_to_level(int agaw)
return agaw + 2;
}
-static inline int agaw_to_width(int agaw)
-{
- return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
-}
-
static inline int width_to_agaw(int width)
{
return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
@@ -947,25 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level)
return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}
-static inline u64 level_mask(int level)
-{
- return -1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 level_size(int level)
-{
- return 1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 align_to_level(u64 pfn, int level)
-{
- return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
- return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
-}
static inline void context_set_present(struct context_entry *context)
{
@@ -1097,7 +1028,7 @@ static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
struct qi_desc *desc)
{
u8 dw = 0, dr = 0;
- int ih = 0;
+ int ih = addr & 1;
if (cap_write_drain(iommu->cap))
dw = 1;
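qi_desc_iotlb() now derives the invalidation hint from bit 0 of @addr instead of hard-coding ih = 0. IOTLB invalidation addresses are at least 4KiB aligned, so the low bit is free to carry the hint. The packing idiom, as a self-contained sketch:

	/* Idiom sketch: bit 0 of a 4KiB-aligned address carries IH, which
	 * the hunk above extracts with "addr & 1". */
	static inline u64 example_pack_ih(u64 addr_4k_aligned, bool ih)
	{
		return addr_4k_aligned | (ih ? 1 : 0);
	}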
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 1b6ad9c900a5..a3fb8c193ca6 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -19,7 +19,7 @@
#include "pasid.h"
static int intel_nested_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
@@ -29,11 +29,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
device_block_translation(dev);
- if (iommu->agaw < dmar_domain->s2_domain->agaw) {
- dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
- return -ENODEV;
- }
-
/*
* Stage-1 domain cannot work alone, it is nested on a s2_domain.
* The s2_domain will be used in nested translation, hence needs
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 52f678975da7..3e2255057079 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -366,7 +366,7 @@ static void pasid_pte_config_first_level(struct intel_iommu *iommu,
pasid_set_domain_id(pte, did);
pasid_set_address_width(pte, iommu->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP);
/* Setup Present and PASID Granular Transfer Type: */
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
@@ -461,19 +461,22 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu,
*/
static void pasid_pte_config_second_level(struct intel_iommu *iommu,
struct pasid_entry *pte,
- u64 pgd_val, int agaw, u16 did,
- bool dirty_tracking)
+ struct dmar_domain *domain, u16 did)
{
+ struct pt_iommu_vtdss_hw_info pt_info;
+
lockdep_assert_held(&iommu->lock);
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
pasid_clear_entry(pte);
pasid_set_domain_id(pte, did);
- pasid_set_slptr(pte, pgd_val);
- pasid_set_address_width(pte, agaw);
+ pasid_set_slptr(pte, pt_info.ssptptr);
+ pasid_set_address_width(pte, pt_info.aw);
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
- if (dirty_tracking)
+ pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
+ if (domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_present(pte);
@@ -484,10 +487,9 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct device *dev, u32 pasid)
{
struct pasid_entry *pte;
- struct dma_pte *pgd;
- u64 pgd_val;
u16 did;
+
/*
* If hardware advertises no support for second level
* translation, return directly.
@@ -498,8 +500,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
spin_lock(&iommu->lock);
@@ -514,8 +514,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EBUSY;
}
- pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw,
- did, domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, pte, domain, did);
spin_unlock(&iommu->lock);
pasid_flush_caches(iommu, pte, pasid, did);
@@ -529,8 +528,6 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
u32 pasid)
{
struct pasid_entry *pte, new_pte;
- struct dma_pte *pgd;
- u64 pgd_val;
u16 did;
/*
@@ -543,13 +540,9 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
- pasid_pte_config_second_level(iommu, &new_pte, pgd_val,
- domain->agaw, did,
- domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, &new_pte, domain, did);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
@@ -747,10 +740,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
struct dmar_domain *s2_domain,
u16 did)
{
- struct dma_pte *pgd = s2_domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
lockdep_assert_held(&iommu->lock);
+ pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info);
+
pasid_clear_entry(pte);
if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
@@ -770,11 +765,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
if (s2_domain->force_snooping)
pasid_set_pgsnp(pte);
- pasid_set_slptr(pte, virt_to_phys(pgd));
+ pasid_set_slptr(pte, pt_info.ssptptr);
pasid_set_fault_enable(pte);
pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, s2_domain->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_address_width(pte, pt_info.aw);
+ pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
if (s2_domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index a771a77d4239..b4c85242dc79 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -24,6 +24,7 @@
#define PASID_FLAG_NESTED BIT(1)
#define PASID_FLAG_PAGE_SNOOP BIT(2)
+#define PASID_FLAG_PWSNP BIT(2)
/*
* The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first-
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index e147f71f91b7..71de7947971f 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -170,6 +170,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
/* Setup the pasid table: */
sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+ sflags |= PASID_FLAG_PWSNP;
ret = __domain_setup_first_level(iommu, dev, pasid,
FLPT_DEFAULT_DID, __pa(mm->pgd),
sflags, old);
diff --git a/drivers/iommu/io-pgtable-arm-selftests.c b/drivers/iommu/io-pgtable-arm-selftests.c
new file mode 100644
index 000000000000..334e70350924
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm-selftests.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit selftests for the CPU-agnostic ARM page table allocator.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt
+
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+
+#include "io-pgtable-arm.h"
+
+static struct io_pgtable_cfg *cfg_cookie;
+
+static void dummy_tlb_flush_all(void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_flush(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+ WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ dummy_tlb_flush(iova, granule, granule, cookie);
+}
+
+static const struct iommu_flush_ops dummy_tlb_ops = {
+ .tlb_flush_all = dummy_tlb_flush_all,
+ .tlb_flush_walk = dummy_tlb_flush,
+ .tlb_add_page = dummy_tlb_add_page,
+};
+
+#define __FAIL(test, i) ({ \
+ KUNIT_FAIL(test, "test failed for fmt idx %d\n", (i)); \
+ -EFAULT; \
+})
+
+static int arm_lpae_run_tests(struct kunit *test, struct io_pgtable_cfg *cfg)
+{
+ static const enum io_pgtable_fmt fmts[] = {
+ ARM_64_LPAE_S1,
+ ARM_64_LPAE_S2,
+ };
+
+ int i, j;
+ unsigned long iova;
+ size_t size, mapped;
+ struct io_pgtable_ops *ops;
+
+ for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
+ cfg_cookie = cfg;
+ ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
+ if (!ops) {
+ kunit_err(test, "failed to allocate io pgtable ops\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Initial sanity checks.
+ * Empty page tables shouldn't provide any translations.
+ */
+ if (ops->iova_to_phys(ops, 42))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, SZ_1G + 42))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, SZ_2G + 42))
+ return __FAIL(test, i);
+
+ /*
+ * Distinct mappings of different granule sizes.
+ */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ /* Overlapping mappings */
+ if (!ops->map_pages(ops, iova, iova + size, size, 1,
+ IOMMU_READ | IOMMU_NOEXEC,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(test, i);
+
+ iova += SZ_1G;
+ }
+
+ /* Full unmap */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42))
+ return __FAIL(test, i);
+
+ /* Remap full block */
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_WRITE, GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(test, i);
+
+ iova += SZ_1G;
+ }
+
+ /*
+ * Map/unmap the last largest supported page of the IAS; this can
+ * trigger corner cases in the concatenated page tables.
+ */
+ mapped = 0;
+ size = 1UL << __fls(cfg->pgsize_bitmap);
+ iova = (1UL << cfg->ias) - size;
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+ if (mapped != size)
+ return __FAIL(test, i);
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(test, i);
+
+ free_io_pgtable_ops(ops);
+ }
+
+ return 0;
+}
+
+static void arm_lpae_do_selftests(struct kunit *test)
+{
+ static const unsigned long pgsize[] = {
+ SZ_4K | SZ_2M | SZ_1G,
+ SZ_16K | SZ_32M,
+ SZ_64K | SZ_512M,
+ };
+
+ static const unsigned int address_size[] = {
+ 32, 36, 40, 42, 44, 48,
+ };
+
+ int i, j, k, pass = 0, fail = 0;
+ struct device *dev;
+ struct io_pgtable_cfg cfg = {
+ .tlb = &dummy_tlb_ops,
+ .coherent_walk = true,
+ .quirks = IO_PGTABLE_QUIRK_NO_WARN,
+ };
+
+ dev = kunit_device_register(test, "io-pgtable-test");
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev);
+ if (IS_ERR_OR_NULL(dev))
+ return;
+
+ cfg.iommu_dev = dev;
+
+ for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
+ for (j = 0; j < ARRAY_SIZE(address_size); ++j) {
+ /* Don't use ias > oas as it is not valid for stage-2. */
+ for (k = 0; k <= j; ++k) {
+ cfg.pgsize_bitmap = pgsize[i];
+ cfg.ias = address_size[k];
+ cfg.oas = address_size[j];
+ kunit_info(test, "pgsize_bitmap 0x%08lx, IAS %u OAS %u\n",
+ pgsize[i], cfg.ias, cfg.oas);
+ if (arm_lpae_run_tests(test, &cfg))
+ fail++;
+ else
+ pass++;
+ }
+ }
+ }
+
+ kunit_info(test, "completed with %d PASS %d FAIL\n", pass, fail);
+}
+
+static struct kunit_case io_pgtable_arm_test_cases[] = {
+ KUNIT_CASE(arm_lpae_do_selftests),
+ {},
+};
+
+static struct kunit_suite io_pgtable_arm_test = {
+ .name = "io-pgtable-arm-test",
+ .test_cases = io_pgtable_arm_test_cases,
+};
+
+kunit_test_suite(io_pgtable_arm_test);
+
+MODULE_DESCRIPTION("io-pgtable-arm library kunit tests");
+MODULE_LICENSE("GPL");
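The selftest now runs under KUnit instead of a subsys_initcall. A minimal .kunitconfig for exercising it might look like the following; the exact Kconfig symbol gating the new file is not visible in this diff, so the last line is an assumption:

	CONFIG_KUNIT=y
	CONFIG_IOMMU_SUPPORT=y
	CONFIG_IOMMU_IO_PGTABLE_LPAE=y
	CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST=y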
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 7e8e2216c294..e6626004b323 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -12,8 +12,6 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-#include <linux/device/faux.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -1267,204 +1265,3 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
.alloc = arm_mali_lpae_alloc_pgtable,
.free = arm_lpae_free_pgtable,
};
-
-#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
-
-static struct io_pgtable_cfg *cfg_cookie __initdata;
-
-static void __init dummy_tlb_flush_all(void *cookie)
-{
- WARN_ON(cookie != cfg_cookie);
-}
-
-static void __init dummy_tlb_flush(unsigned long iova, size_t size,
- size_t granule, void *cookie)
-{
- WARN_ON(cookie != cfg_cookie);
- WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
-}
-
-static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t granule,
- void *cookie)
-{
- dummy_tlb_flush(iova, granule, granule, cookie);
-}
-
-static const struct iommu_flush_ops dummy_tlb_ops __initconst = {
- .tlb_flush_all = dummy_tlb_flush_all,
- .tlb_flush_walk = dummy_tlb_flush,
- .tlb_add_page = dummy_tlb_add_page,
-};
-
-static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
-{
- struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &data->iop.cfg;
-
- pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n",
- cfg->pgsize_bitmap, cfg->ias);
- pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n",
- ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data),
- ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd);
-}
-
-#define __FAIL(ops, i) ({ \
- WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \
- arm_lpae_dump_ops(ops); \
- -EFAULT; \
-})
-
-static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
-{
- static const enum io_pgtable_fmt fmts[] __initconst = {
- ARM_64_LPAE_S1,
- ARM_64_LPAE_S2,
- };
-
- int i, j;
- unsigned long iova;
- size_t size, mapped;
- struct io_pgtable_ops *ops;
-
- for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
- cfg_cookie = cfg;
- ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
- if (!ops) {
- pr_err("selftest: failed to allocate io pgtable ops\n");
- return -ENOMEM;
- }
-
- /*
- * Initial sanity checks.
- * Empty page tables shouldn't provide any translations.
- */
- if (ops->iova_to_phys(ops, 42))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_1G + 42))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_2G + 42))
- return __FAIL(ops, i);
-
- /*
- * Distinct mappings of different granule sizes.
- */
- iova = 0;
- for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
- size = 1UL << j;
-
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_READ | IOMMU_WRITE |
- IOMMU_NOEXEC | IOMMU_CACHE,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- /* Overlapping mappings */
- if (!ops->map_pages(ops, iova, iova + size, size, 1,
- IOMMU_READ | IOMMU_NOEXEC,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
- return __FAIL(ops, i);
-
- iova += SZ_1G;
- }
-
- /* Full unmap */
- iova = 0;
- for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
- size = 1UL << j;
-
- if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42))
- return __FAIL(ops, i);
-
- /* Remap full block */
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_WRITE, GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
- return __FAIL(ops, i);
-
- iova += SZ_1G;
- }
-
- /*
- * Map/unmap the last largest supported page of the IAS, this can
- * trigger corner cases in the concatednated page tables.
- */
- mapped = 0;
- size = 1UL << __fls(cfg->pgsize_bitmap);
- iova = (1UL << cfg->ias) - size;
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_READ | IOMMU_WRITE |
- IOMMU_NOEXEC | IOMMU_CACHE,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
- if (mapped != size)
- return __FAIL(ops, i);
- if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- free_io_pgtable_ops(ops);
- }
-
- return 0;
-}
-
-static int __init arm_lpae_do_selftests(void)
-{
- static const unsigned long pgsize[] __initconst = {
- SZ_4K | SZ_2M | SZ_1G,
- SZ_16K | SZ_32M,
- SZ_64K | SZ_512M,
- };
-
- static const unsigned int address_size[] __initconst = {
- 32, 36, 40, 42, 44, 48,
- };
-
- int i, j, k, pass = 0, fail = 0;
- struct faux_device *dev;
- struct io_pgtable_cfg cfg = {
- .tlb = &dummy_tlb_ops,
- .coherent_walk = true,
- .quirks = IO_PGTABLE_QUIRK_NO_WARN,
- };
-
- dev = faux_device_create("io-pgtable-test", NULL, 0);
- if (!dev)
- return -ENOMEM;
-
- cfg.iommu_dev = &dev->dev;
-
- for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
- for (j = 0; j < ARRAY_SIZE(address_size); ++j) {
- /* Don't use ias > oas as it is not valid for stage-2. */
- for (k = 0; k <= j; ++k) {
- cfg.pgsize_bitmap = pgsize[i];
- cfg.ias = address_size[k];
- cfg.oas = address_size[j];
- pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u OAS %u\n",
- pgsize[i], cfg.ias, cfg.oas);
- if (arm_lpae_run_tests(&cfg))
- fail++;
- else
- pass++;
- }
- }
- }
-
- pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail);
- faux_device_destroy(dev);
-
- return fail ? -EFAULT : 0;
-}
-subsys_initcall(arm_lpae_do_selftests);
-#endif
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 8841c1487f00..843fec8e8a51 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
#endif
-#ifdef CONFIG_AMD_IOMMU
- [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
- [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
-#endif
};
static int check_custom_allocator(enum io_pgtable_fmt fmt,
diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c
index 238c09e5166b..3bab175d8557 100644
--- a/drivers/iommu/iommu-pages.c
+++ b/drivers/iommu/iommu-pages.c
@@ -4,6 +4,7 @@
* Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#include "iommu-pages.h"
+#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>
@@ -22,6 +23,11 @@ IOPTDESC_MATCH(memcg_data, memcg_data);
#undef IOPTDESC_MATCH
static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
+static inline size_t ioptdesc_mem_size(struct ioptdesc *desc)
+{
+ return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT);
+}
+
/**
* iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from
* specific NUMA node
@@ -36,6 +42,7 @@ static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
*/
void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
{
+ struct ioptdesc *iopt;
unsigned long pgcnt;
struct folio *folio;
unsigned int order;
@@ -60,6 +67,9 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
if (unlikely(!folio))
return NULL;
+ iopt = folio_ioptdesc(folio);
+ iopt->incoherent = false;
+
/*
* All page allocations that should be reported as "iommu-pagetables"
* to userspace must use one of the functions below. This includes
@@ -80,7 +90,10 @@ EXPORT_SYMBOL_GPL(iommu_alloc_pages_node_sz);
static void __iommu_free_desc(struct ioptdesc *iopt)
{
struct folio *folio = ioptdesc_folio(iopt);
- const unsigned long pgcnt = 1UL << folio_order(folio);
+ const unsigned long pgcnt = folio_nr_pages(folio);
+
+ if (IOMMU_PAGES_USE_DMA_API)
+ WARN_ON_ONCE(iopt->incoherent);
mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt);
lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt);
@@ -117,3 +130,124 @@ void iommu_put_pages_list(struct iommu_pages_list *list)
__iommu_free_desc(iopt);
}
EXPORT_SYMBOL_GPL(iommu_put_pages_list);
+
+/**
+ * iommu_pages_start_incoherent - Setup the page for cache incoherent operation
+ * @virt: The page to setup
+ * @dma_dev: The iommu device
+ *
+ * For incoherent memory this will use the DMA API to manage the cache flushing
+ * on some arches. This is a lot of complexity compared to just calling
+ * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers
+ * have been doing. The DMA API requires keeping track of the DMA map and
+ * freeing it when required. This keeps track of the dma map inside the ioptdesc
+ * so that error paths are simple for the caller.
+ */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+ dma_addr_t dma;
+
+ if (WARN_ON(iopt->incoherent))
+ return -EINVAL;
+
+ if (!IOMMU_PAGES_USE_DMA_API) {
+ iommu_pages_flush_incoherent(dma_dev, virt, 0,
+ ioptdesc_mem_size(iopt));
+ } else {
+ dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, dma))
+ return -EINVAL;
+
+ /*
+ * The DMA API is not allowed to do anything other than DMA
+ * direct. It would be nice to also check
+ * dev_is_dma_coherent(dma_dev));
+ */
+ if (WARN_ON(dma != virt_to_phys(virt))) {
+ dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ iopt->incoherent = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent);
+
+/**
+ * iommu_pages_start_incoherent_list - Make a list of pages incoherent
+ * @list: The list of pages to setup
+ * @dma_dev: The iommu device
+ *
+ * Perform iommu_pages_start_incoherent() across all pages in the list.
+ *
+ * If this fails the caller must call iommu_pages_stop_incoherent_list().
+ */
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+ int ret;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ if (WARN_ON(cur->incoherent))
+ continue;
+
+ ret = iommu_pages_start_incoherent(
+ folio_address(ioptdesc_folio(cur)), dma_dev);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list);
+
+/**
+ * iommu_pages_stop_incoherent_list - Undo incoherence across a list
+ * @list: The list of pages to release
+ * @dma_dev: The iommu device
+ *
+ * Revert iommu_pages_start_incoherent() across the whole list. Pages for
+ * which iommu_pages_start_incoherent() was not called, or for which it
+ * did not succeed, are ignored.
+ */
+#if IOMMU_PAGES_USE_DMA_API
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ struct folio *folio = ioptdesc_folio(cur);
+
+ if (!cur->incoherent)
+ continue;
+ dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)),
+ ioptdesc_mem_size(cur), DMA_TO_DEVICE);
+ cur->incoherent = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list);
+
+/**
+ * iommu_pages_free_incoherent - Free an incoherent page
+ * @virt: virtual address of the page to be freed.
+ * @dma_dev: The iommu device
+ *
+ * If the page is incoherent, it is made coherent again and then freed.
+ */
+void iommu_pages_free_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+
+ if (iopt->incoherent) {
+ dma_unmap_single(dma_dev, virt_to_phys(virt),
+ ioptdesc_mem_size(iopt), DMA_TO_DEVICE);
+ iopt->incoherent = 0;
+ }
+ __iommu_free_desc(iopt);
+}
+EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent);
+#endif
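Taken together these helpers give drivers a simple lifecycle for page-table memory on non-coherent hardware. A hedged usage sketch, relying only on functions added or referenced in this patch:

	/* Usage sketch; not code from this patch */
	static void *example_alloc_table(struct device *dma_dev)
	{
		void *tbl = iommu_alloc_pages_sz(GFP_KERNEL, SZ_4K);

		if (!tbl)
			return NULL;
		if (iommu_pages_start_incoherent(tbl, dma_dev)) {
			iommu_free_pages(tbl);
			return NULL;
		}
		return tbl;
	}

	static void example_free_table(void *tbl, struct device *dma_dev)
	{
		/* Unmaps from the DMA API (if mapped), then frees */
		iommu_pages_free_incoherent(tbl, dma_dev);
	}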
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
index b3af2813ed0c..ae9da4f571f6 100644
--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@ -21,7 +21,10 @@ struct ioptdesc {
struct list_head iopt_freelist_elm;
unsigned long __page_mapping;
- pgoff_t __index;
+ union {
+ u8 incoherent;
+ pgoff_t __index;
+ };
void *_private;
unsigned int __page_type;
@@ -98,4 +101,48 @@ static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size)
return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size);
}
-#endif /* __IOMMU_PAGES_H */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev);
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+
+#ifdef CONFIG_X86
+#define IOMMU_PAGES_USE_DMA_API 0
+#include <linux/cacheflush.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
+{
+ clflush_cache_range(virt + offset, len);
+}
+static inline void
+iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ /*
+ * For performance, leave the incoherent flag alone, which turns this
+ * into a NOP. On x86 the rest of the stop/free flow ignores the flag.
+ */
+}
+static inline void iommu_pages_free_incoherent(void *virt,
+ struct device *dma_dev)
+{
+ iommu_free_pages(virt);
+}
+#else
+#define IOMMU_PAGES_USE_DMA_API 1
+#include <linux/dma-mapping.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
+{
+ dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len,
+ DMA_TO_DEVICE);
+}
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+void iommu_pages_free_incoherent(void *virt, struct device *dma_dev);
+#endif
+
+#endif /* __IOMMU_PAGES_H */
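After updating an entry in an incoherent table the walker must push the touched bytes out with iommu_pages_flush_incoherent() before hardware can observe them; on x86 that is a clflush, elsewhere a dma_sync. The expected write-then-flush pattern, sketched with an illustrative PTE layout:

	/* Pattern sketch; pte type and index math are illustrative */
	static void example_set_pte(struct device *dma_dev, u64 *table,
				    unsigned int idx, u64 val)
	{
		table[idx] = val;
		iommu_pages_flush_incoherent(dma_dev, table,
					     idx * sizeof(u64), sizeof(u64));
	}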
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 59244c744eab..2ca990dfbb88 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -100,7 +100,7 @@ static int iommu_bus_notifier(struct notifier_block *nb,
unsigned long action, void *data);
static void iommu_release_device(struct device *dev);
static int __iommu_attach_device(struct iommu_domain *domain,
- struct device *dev);
+ struct device *dev, struct iommu_domain *old);
static int __iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group);
static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev,
@@ -114,6 +114,7 @@ enum {
static int __iommu_device_set_domain(struct iommu_group *group,
struct device *dev,
struct iommu_domain *new_domain,
+ struct iommu_domain *old_domain,
unsigned int flags);
static int __iommu_group_set_domain_internal(struct iommu_group *group,
struct iommu_domain *new_domain,
@@ -542,8 +543,21 @@ static void iommu_deinit_device(struct device *dev)
* Regardless, if a delayed attach never occurred, then the release
* should still avoid touching any hardware configuration either.
*/
- if (!dev->iommu->attach_deferred && ops->release_domain)
- ops->release_domain->ops->attach_dev(ops->release_domain, dev);
+ if (!dev->iommu->attach_deferred && ops->release_domain) {
+ struct iommu_domain *release_domain = ops->release_domain;
+
+ /*
+ * If the device requires direct mappings then it should not
+ * be parked on a BLOCKED domain during release as that would
+ * break the direct mappings.
+ */
+ if (dev->iommu->require_direct && ops->identity_domain &&
+ release_domain == ops->blocked_domain)
+ release_domain = ops->identity_domain;
+
+ release_domain->ops->attach_dev(release_domain, dev,
+ group->domain);
+ }
if (ops->release_device)
ops->release_device(dev);
@@ -628,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
if (group->default_domain)
iommu_create_device_direct_mappings(group->default_domain, dev);
if (group->domain) {
- ret = __iommu_device_set_domain(group, dev, group->domain, 0);
+ ret = __iommu_device_set_domain(group, dev, group->domain, NULL,
+ 0);
if (ret)
goto err_remove_gdev;
} else if (!group->default_domain && !group_list) {
@@ -2115,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group)
}
static int __iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
int ret;
if (unlikely(domain->ops->attach_dev == NULL))
return -ENODEV;
- ret = domain->ops->attach_dev(domain, dev);
+ ret = domain->ops->attach_dev(domain, dev, old);
if (ret)
return ret;
dev->iommu->attach_deferred = 0;
@@ -2171,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
{
if (dev->iommu && dev->iommu->attach_deferred)
- return __iommu_attach_device(domain, dev);
+ return __iommu_attach_device(domain, dev, NULL);
return 0;
}
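Passing the previously attached domain into attach_dev lets a driver tell a first attach apart from a replacement and hand over the translation without an intermediate blocked window. A hedged driver-side sketch; both helpers are hypothetical:

	/* Hypothetical driver callback showing one way to use @old */
	static int example_attach_dev(struct iommu_domain *domain,
				      struct device *dev,
				      struct iommu_domain *old)
	{
		if (!old)	/* first attach: nothing to tear down */
			return example_install(dev, domain);

		/* replacement: switch translation without blocking DMA */
		return example_replace(dev, old, domain);
	}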
@@ -2284,6 +2299,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_group);
static int __iommu_device_set_domain(struct iommu_group *group,
struct device *dev,
struct iommu_domain *new_domain,
+ struct iommu_domain *old_domain,
unsigned int flags)
{
int ret;
@@ -2309,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group,
dev->iommu->attach_deferred = 0;
}
- ret = __iommu_attach_device(new_domain, dev);
+ ret = __iommu_attach_device(new_domain, dev, old_domain);
if (ret) {
/*
* If we have a blocking domain then try to attach that in hopes
@@ -2319,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group,
if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) &&
group->blocking_domain &&
group->blocking_domain != new_domain)
- __iommu_attach_device(group->blocking_domain, dev);
+ __iommu_attach_device(group->blocking_domain, dev,
+ old_domain);
return ret;
}
return 0;
@@ -2366,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
result = 0;
for_each_group_device(group, gdev) {
ret = __iommu_device_set_domain(group, gdev->dev, new_domain,
- flags);
+ group->domain, flags);
if (ret) {
result = ret;
/*
@@ -2391,6 +2408,9 @@ err_revert:
*/
last_gdev = gdev;
for_each_group_device(group, gdev) {
+ /* No need to revert the last gdev that failed to set domain */
+ if (gdev == last_gdev)
+ break;
/*
* A NULL domain can happen only for first probe, in which case
* we leave group->domain as NULL and let release clean
@@ -2398,10 +2418,8 @@ err_revert:
*/
if (group->domain)
WARN_ON(__iommu_device_set_domain(
- group, gdev->dev, group->domain,
+ group, gdev->dev, group->domain, new_domain,
IOMMU_SET_DOMAIN_MUST_SUCCEED));
- if (gdev == last_gdev)
- break;
}
return ret;
}
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
index 2beeb4f60ee5..eae3f03629b0 100644
--- a/drivers/iommu/iommufd/Kconfig
+++ b/drivers/iommu/iommufd/Kconfig
@@ -41,6 +41,7 @@ config IOMMUFD_TEST
depends on DEBUG_KERNEL
depends on FAULT_INJECTION
depends on RUNTIME_TESTING_MENU
+ depends on IOMMU_PT_AMDV1
select IOMMUFD_DRIVER
default n
help
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 75d60f2ad900..54cf4d856179 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -8,8 +8,10 @@
* The datastructure uses the iopt_pages to optimize the storage of the PFNs
* between the domains and xarray.
*/
+#include <linux/dma-buf.h>
#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/file.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
@@ -284,6 +286,9 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
case IOPT_ADDRESS_FILE:
start = elm->start_byte + elm->pages->start;
break;
+ case IOPT_ADDRESS_DMABUF:
+ start = elm->start_byte + elm->pages->dmabuf.start;
+ break;
}
rc = iopt_alloc_iova(iopt, dst_iova, start, length);
if (rc)
@@ -468,25 +473,53 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
* @iopt: io_pagetable to act on
* @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
* the chosen iova on output. Otherwise is the iova to map to on input
- * @file: file to map
+ * @fd: fdno of a file to map
* @start: map file starting at this byte offset
* @length: Number of bytes to map
* @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
* @flags: IOPT_ALLOC_IOVA or zero
*/
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
- unsigned long *iova, struct file *file,
- unsigned long start, unsigned long length,
- int iommu_prot, unsigned int flags)
+ unsigned long *iova, int fd, unsigned long start,
+ unsigned long length, int iommu_prot,
+ unsigned int flags)
{
struct iopt_pages *pages;
+ struct dma_buf *dmabuf;
+ unsigned long start_byte;
+ unsigned long last;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(start, length - 1, &last))
+ return -EOVERFLOW;
+
+ start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
+ dmabuf = dma_buf_get(fd);
+ if (!IS_ERR(dmabuf)) {
+ pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
+ length,
+ iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages)) {
+ dma_buf_put(dmabuf);
+ return PTR_ERR(pages);
+ }
+ } else {
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ pages = iopt_alloc_file_pages(file, start_byte, start, length,
+ iommu_prot & IOMMU_WRITE);
+ fput(file);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ }
- pages = iopt_alloc_file_pages(file, start, length,
- iommu_prot & IOMMU_WRITE);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
return iopt_map_common(ictx, iopt, pages, iova, length,
- start - pages->start, iommu_prot, flags);
+ start_byte, iommu_prot, flags);
}
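Since dma_buf_get() simply fails for an fd that is not a dma-buf, the one map-file path now accepts both memfds and dma-buf fds transparently. A userspace-style sketch, under the assumption that the iommufd UAPI struct looks as it does in current kernels:

	/* Userspace sketch; assumes <linux/iommufd.h> definitions */
	struct iommu_ioas_map_file cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE,
		.ioas_id = ioas_id,
		.fd = buf_fd,	/* a memfd or a dma-buf fd */
		.start = 0,
		.length = len,
	};

	if (ioctl(iommufd_fd, IOMMU_IOAS_MAP_FILE, &cmd))
		return -1;
	iova = cmd.iova;	/* kernel-chosen IOVA when not FIXED */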
struct iova_bitmap_fn_arg {
@@ -961,9 +994,15 @@ static void iopt_unfill_domain(struct io_pagetable *iopt,
WARN_ON(!area->storage_domain);
if (area->storage_domain == domain)
area->storage_domain = storage_domain;
+ if (iopt_is_dmabuf(pages)) {
+ if (!iopt_dmabuf_revoked(pages))
+ iopt_area_unmap_domain(area, domain);
+ iopt_dmabuf_untrack_domain(pages, area, domain);
+ }
mutex_unlock(&pages->mutex);
- iopt_area_unmap_domain(area, domain);
+ if (!iopt_is_dmabuf(pages))
+ iopt_area_unmap_domain(area, domain);
}
return;
}
@@ -980,6 +1019,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt,
WARN_ON(area->storage_domain != domain);
area->storage_domain = NULL;
iopt_area_unfill_domain(area, pages, domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
mutex_unlock(&pages->mutex);
}
}
@@ -1009,10 +1050,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt,
if (!pages)
continue;
- mutex_lock(&pages->mutex);
+ guard(mutex)(&pages->mutex);
+ if (iopt_is_dmabuf(pages)) {
+ rc = iopt_dmabuf_track_domain(pages, area, domain);
+ if (rc)
+ goto out_unfill;
+ }
rc = iopt_area_fill_domain(area, domain);
if (rc) {
- mutex_unlock(&pages->mutex);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
goto out_unfill;
}
if (!area->storage_domain) {
@@ -1021,7 +1068,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt,
interval_tree_insert(&area->pages_node,
&pages->domains_itree);
}
- mutex_unlock(&pages->mutex);
}
return 0;
@@ -1042,6 +1088,8 @@ out_unfill:
area->storage_domain = NULL;
}
iopt_area_unfill_domain(area, pages, domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
mutex_unlock(&pages->mutex);
}
return rc;
@@ -1252,6 +1300,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
if (!pages || area->prevent_access)
return -EBUSY;
+ /* Maintaining the domains_itree below is a bit complicated */
+ if (iopt_is_dmabuf(pages))
+ return -EOPNOTSUPP;
+
if (new_start & (alignment - 1) ||
iopt_area_start_byte(area, new_start) & (alignment - 1))
return -EINVAL;
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index b6064f4ce4af..14cd052fd320 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -5,6 +5,7 @@
#ifndef __IO_PAGETABLE_H
#define __IO_PAGETABLE_H
+#include <linux/dma-buf.h>
#include <linux/interval_tree.h>
#include <linux/kref.h>
#include <linux/mutex.h>
@@ -69,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
void iopt_area_unmap_domain(struct iopt_area *area,
struct iommu_domain *domain);
+int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area,
+ struct iommu_domain *domain);
+void iopt_dmabuf_untrack_domain(struct iopt_pages *pages,
+ struct iopt_area *area,
+ struct iommu_domain *domain);
+int iopt_dmabuf_track_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages);
+void iopt_dmabuf_untrack_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages);
+
static inline unsigned long iopt_area_index(struct iopt_area *area)
{
return area->pages_node.start;
@@ -179,7 +190,22 @@ enum {
enum iopt_address_type {
IOPT_ADDRESS_USER = 0,
- IOPT_ADDRESS_FILE = 1,
+ IOPT_ADDRESS_FILE,
+ IOPT_ADDRESS_DMABUF,
+};
+
+struct iopt_pages_dmabuf_track {
+ struct iommu_domain *domain;
+ struct iopt_area *area;
+ struct list_head elm;
+};
+
+struct iopt_pages_dmabuf {
+ struct dma_buf_attachment *attach;
+ struct dma_buf_phys_vec phys;
+ /* Always PAGE_SIZE aligned */
+ unsigned long start;
+ struct list_head tracker;
};
/*
@@ -209,6 +235,8 @@ struct iopt_pages {
struct file *file;
unsigned long start;
};
+ /* IOPT_ADDRESS_DMABUF */
+ struct iopt_pages_dmabuf dmabuf;
};
bool writable:1;
u8 account_mode;
@@ -220,10 +248,32 @@ struct iopt_pages {
struct rb_root_cached domains_itree;
};
+static inline bool iopt_is_dmabuf(struct iopt_pages *pages)
+{
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return false;
+ return pages->type == IOPT_ADDRESS_DMABUF;
+}
+
+static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages)
+{
+ lockdep_assert_held(&pages->mutex);
+ if (iopt_is_dmabuf(pages))
+ return pages->dmabuf.phys.len == 0;
+ return false;
+}
+
struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
unsigned long length, bool writable);
-struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+struct iopt_pages *iopt_alloc_file_pages(struct file *file,
+ unsigned long start_byte,
+ unsigned long start,
unsigned long length, bool writable);
+struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx,
+ struct dma_buf *dmabuf,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable);
void iopt_release_pages(struct kref *kref);
static inline void iopt_put_pages(struct iopt_pages *pages)
{
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 459a7c516915..f4721afedadc 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -207,7 +207,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
unsigned long iova = cmd->iova;
struct iommufd_ioas *ioas;
unsigned int flags = 0;
- struct file *file;
int rc;
if (cmd->flags &
@@ -229,11 +228,7 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
flags = IOPT_ALLOC_IOVA;
- file = fget(cmd->fd);
- if (!file)
- return -EBADF;
-
- rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file,
+ rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd,
cmd->start, cmd->length,
conv_iommu_prot(cmd->flags), flags);
if (rc)
@@ -243,7 +238,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put:
iommufd_put_object(ucmd->ictx, &ioas->obj);
- fput(file);
return rc;
}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 85d0843ed07b..eb6d1a70f673 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -19,6 +19,8 @@ struct iommu_domain;
struct iommu_group;
struct iommu_option;
struct iommufd_device;
+struct dma_buf_attachment;
+struct dma_buf_phys_vec;
struct iommufd_sw_msi_map {
struct list_head sw_msi_item;
@@ -108,7 +110,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long length, int iommu_prot,
unsigned int flags);
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
- unsigned long *iova, struct file *file,
+ unsigned long *iova, int fd,
unsigned long start, unsigned long length,
int iommu_prot, unsigned int flags);
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
@@ -504,6 +506,8 @@ void iommufd_device_pre_destroy(struct iommufd_object *obj);
void iommufd_device_destroy(struct iommufd_object *obj);
int iommufd_get_hw_info(struct iommufd_ucmd *ucmd);
+struct device *iommufd_global_device(void);
+
struct iommufd_access {
struct iommufd_object obj;
struct iommufd_ctx *ictx;
@@ -713,6 +717,8 @@ bool iommufd_should_fail(void);
int __init iommufd_test_init(void);
void iommufd_test_exit(void);
bool iommufd_selftest_is_mock_dev(struct device *dev);
+int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys);
#else
static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
unsigned int ioas_id,
@@ -734,5 +740,11 @@ static inline bool iommufd_selftest_is_mock_dev(struct device *dev)
{
return false;
}
+static inline int
+iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ return -EOPNOTSUPP;
+}
#endif
#endif
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 8fc618b2bcf9..73e73e1ec158 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -29,11 +29,22 @@ enum {
IOMMU_TEST_OP_PASID_REPLACE,
IOMMU_TEST_OP_PASID_DETACH,
IOMMU_TEST_OP_PASID_CHECK_HWPT,
+ IOMMU_TEST_OP_DMABUF_GET,
+ IOMMU_TEST_OP_DMABUF_REVOKE,
};
enum {
+ MOCK_IOMMUPT_DEFAULT = 0,
+ MOCK_IOMMUPT_HUGE,
+ MOCK_IOMMUPT_AMDV1,
+};
+
+/* These values are true for MOCK_IOMMUPT_DEFAULT */
+enum {
MOCK_APERTURE_START = 1UL << 24,
MOCK_APERTURE_LAST = (1UL << 31) - 1,
+ MOCK_PAGE_SIZE = 2048,
+ MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE,
};
enum {
@@ -52,7 +63,6 @@ enum {
enum {
MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
- MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1,
MOCK_FLAGS_DEVICE_PASID = 1 << 2,
};
@@ -176,6 +186,14 @@ struct iommu_test_cmd {
__u32 hwpt_id;
/* @id is stdev_id */
} pasid_check;
+ struct {
+ __u32 length;
+ __u32 open_flags;
+ } dmabuf_get;
+ struct {
+ __s32 dmabuf_fd;
+ __u32 revoked;
+ } dmabuf_revoke;
};
__u32 last;
};
@@ -205,6 +223,7 @@ struct iommu_test_hw_info {
*/
struct iommu_hwpt_selftest {
__u32 iotlb;
+ __u32 pagetable_type;
};
/* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index ce775fbbae94..5cc4b08c25f5 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -751,6 +751,15 @@ static struct miscdevice vfio_misc_dev = {
.mode = 0666,
};
+/*
+ * Used only by DMABUF; returns a valid struct device to use as a dummy
+ * struct device for attachment.
+ */
+struct device *iommufd_global_device(void)
+{
+ return iommu_misc_dev.this_device;
+}
+
static int __init iommufd_init(void)
{
int ret;
@@ -794,5 +803,6 @@ MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS("IOMMUFD_INTERNAL");
MODULE_IMPORT_NS("IOMMUFD");
+MODULE_IMPORT_NS("DMA_BUF");
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index c3433b845561..dbe51ecb9a20 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -45,6 +45,8 @@
* last_iova + 1 can overflow. An iopt_pages index will always be much less than
* ULONG_MAX so last_index + 1 cannot overflow.
*/
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
@@ -53,6 +55,7 @@
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
+#include <linux/vfio_pci_core.h>
#include "double_span.h"
#include "io_pagetable.h"
@@ -258,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages,
return container_of(node, struct iopt_area, pages_node);
}
+enum batch_kind {
+ BATCH_CPU_MEMORY = 0,
+ BATCH_MMIO,
+};
+
/*
* A simple datastructure to hold a vector of PFNs, optimized for contiguous
* PFNs. This is used as a temporary holding memory for shuttling pfns from one
@@ -271,7 +279,9 @@ struct pfn_batch {
unsigned int array_size;
unsigned int end;
unsigned int total_pfns;
+ enum batch_kind kind;
};
+enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) };
static void batch_clear(struct pfn_batch *batch)
{
@@ -348,11 +358,17 @@ static void batch_destroy(struct pfn_batch *batch, void *backup)
}
static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn,
- u32 nr)
+ u32 nr, enum batch_kind kind)
{
- const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));
unsigned int end = batch->end;
+ if (batch->kind != kind) {
+ /* One kind per batch */
+ if (batch->end != 0)
+ return false;
+ batch->kind = kind;
+ }
+
if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] &&
nr <= MAX_NPFNS - batch->npfns[end - 1]) {
batch->npfns[end - 1] += nr;
@@ -379,7 +395,7 @@ static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr)
/* true if the pfn was added, false otherwise */
static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
{
- return batch_add_pfn_num(batch, pfn, 1);
+ return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY);
}
/*
@@ -492,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
{
bool disable_large_pages = area->iopt->disable_large_pages;
unsigned long last_iova = iopt_area_last_iova(area);
+ int iommu_prot = area->iommu_prot;
unsigned int page_offset = 0;
unsigned long start_iova;
unsigned long next_iova;
@@ -499,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
unsigned long iova;
int rc;
+ if (batch->kind == BATCH_MMIO) {
+ iommu_prot &= ~IOMMU_CACHE;
+ iommu_prot |= IOMMU_MMIO;
+ }
+
/* The first index might be a partial page */
if (start_index == iopt_area_index(area))
page_offset = area->page_offset;
@@ -512,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
rc = batch_iommu_map_small(
domain, iova,
PFN_PHYS(batch->pfns[cur]) + page_offset,
- next_iova - iova, area->iommu_prot);
+ next_iova - iova, iommu_prot);
else
rc = iommu_map(domain, iova,
PFN_PHYS(batch->pfns[cur]) + page_offset,
- next_iova - iova, area->iommu_prot,
+ next_iova - iova, iommu_prot,
GFP_KERNEL_ACCOUNT);
if (rc)
goto err_unmap;
@@ -652,7 +674,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
nr = min(nr, npages);
npages -= nr;
- if (!batch_add_pfn_num(batch, pfn, nr))
+ if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY))
break;
if (nr > 1) {
rc = folio_add_pins(folio, nr - 1);
@@ -1054,6 +1076,41 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
return iopt_pages_update_pinned(pages, npages, inc, user);
}
+struct pfn_reader_dmabuf {
+ struct dma_buf_phys_vec phys;
+ unsigned long start_offset;
+};
+
+static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf,
+ struct iopt_pages *pages)
+{
+ /* Callers must not get here if the dmabuf was already revoked */
+ if (WARN_ON(iopt_dmabuf_revoked(pages)))
+ return -EINVAL;
+
+ dmabuf->phys = pages->dmabuf.phys;
+ dmabuf->start_offset = pages->dmabuf.start;
+ return 0;
+}
+
+static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf,
+ struct pfn_batch *batch,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE;
+
+ /*
+ * start_index/last_index and start are all PAGE_SIZE aligned; the batch
+ * is always filled using page-size-aligned PFNs, just like the other
+ * types. If the dmabuf has been sliced at a sub-page offset, the common
+ * batch-to-domain code adjusts it before mapping to the domain.
+ */
+ batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start),
+ last_index - start_index + 1, BATCH_MMIO);
+ return 0;
+}
+
/*
* PFNs are stored in three places, in order of preference:
* - The iopt_pages xarray. This is only populated if there is a
@@ -1072,7 +1129,10 @@ struct pfn_reader {
unsigned long batch_end_index;
unsigned long last_index;
- struct pfn_reader_user user;
+ union {
+ struct pfn_reader_user user;
+ struct pfn_reader_dmabuf dmabuf;
+ };
};
static int pfn_reader_update_pinned(struct pfn_reader *pfns)
@@ -1108,7 +1168,7 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
{
struct interval_tree_double_span_iter *span = &pfns->span;
unsigned long start_index = pfns->batch_end_index;
- struct pfn_reader_user *user = &pfns->user;
+ struct pfn_reader_user *user;
unsigned long npages;
struct iopt_area *area;
int rc;
@@ -1140,8 +1200,13 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
return 0;
}
- if (start_index >= pfns->user.upages_end) {
- rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index,
+ if (iopt_is_dmabuf(pfns->pages))
+ return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch,
+ start_index, span->last_hole);
+
+ user = &pfns->user;
+ if (start_index >= user->upages_end) {
+ rc = pfn_reader_user_pin(user, pfns->pages, start_index,
span->last_hole);
if (rc)
return rc;
@@ -1209,7 +1274,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
pfns->batch_start_index = start_index;
pfns->batch_end_index = start_index;
pfns->last_index = last_index;
- pfn_reader_user_init(&pfns->user, pages);
+ if (iopt_is_dmabuf(pages))
+ pfn_reader_dmabuf_init(&pfns->dmabuf, pages);
+ else
+ pfn_reader_user_init(&pfns->user, pages);
rc = batch_init(&pfns->batch, last_index - start_index + 1);
if (rc)
return rc;
@@ -1230,8 +1298,12 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
static void pfn_reader_release_pins(struct pfn_reader *pfns)
{
struct iopt_pages *pages = pfns->pages;
- struct pfn_reader_user *user = &pfns->user;
+ struct pfn_reader_user *user;
+
+ if (iopt_is_dmabuf(pages))
+ return;
+ user = &pfns->user;
if (user->upages_end > pfns->batch_end_index) {
/* Any pages not transferred to the batch are just unpinned */
@@ -1261,7 +1333,8 @@ static void pfn_reader_destroy(struct pfn_reader *pfns)
struct iopt_pages *pages = pfns->pages;
pfn_reader_release_pins(pfns);
- pfn_reader_user_destroy(&pfns->user, pfns->pages);
+ if (!iopt_is_dmabuf(pfns->pages))
+ pfn_reader_user_destroy(&pfns->user, pfns->pages);
batch_destroy(&pfns->batch, NULL);
WARN_ON(pages->last_npinned != pages->npinned);
}
@@ -1340,26 +1413,234 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
return pages;
}
-struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+struct iopt_pages *iopt_alloc_file_pages(struct file *file,
+ unsigned long start_byte,
+ unsigned long start,
unsigned long length, bool writable)
{
struct iopt_pages *pages;
- unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE);
- unsigned long end;
- if (length && check_add_overflow(start, length - 1, &end))
- return ERR_PTR(-EOVERFLOW);
-
- pages = iopt_alloc_pages(start - start_down, length, writable);
+ pages = iopt_alloc_pages(start_byte, length, writable);
if (IS_ERR(pages))
return pages;
pages->file = get_file(file);
- pages->start = start_down;
+ pages->start = start - start_byte;
pages->type = IOPT_ADDRESS_FILE;
return pages;
}
+static void iopt_revoke_notify(struct dma_buf_attachment *attach)
+{
+ struct iopt_pages *pages = attach->importer_priv;
+ struct iopt_pages_dmabuf_track *track;
+
+ guard(mutex)(&pages->mutex);
+ if (iopt_dmabuf_revoked(pages))
+ return;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm) {
+ struct iopt_area *area = track->area;
+
+ iopt_area_unmap_domain_range(area, track->domain,
+ iopt_area_index(area),
+ iopt_area_last_index(area));
+ }
+ pages->dmabuf.phys.len = 0;
+}
+
+static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = {
+ .allow_peer2peer = true,
+ .move_notify = iopt_revoke_notify,
+};
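Revocation is driven from the exporter side: holding the dma-buf reservation lock it calls dma_buf_move_notify(), which fans out to iopt_revoke_notify() above; the importer then records the revoke by zeroing phys.len, which is exactly what iopt_dmabuf_revoked() tests. A hedged exporter-side sketch using the generic dma-buf contract:

	/* Exporter-side sketch; not code from this patch */
	static void example_exporter_revoke(struct dma_buf *dmabuf)
	{
		dma_resv_lock(dmabuf->resv, NULL);
		dma_buf_move_notify(dmabuf);	/* runs importers' move_notify */
		dma_resv_unlock(dmabuf->resv);
	}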
+
+/*
+ * iommufd and vfio have a circular dependency. Future work on a
+ * phys-based private interconnect will remove this.
+ */
+static int
+sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ typeof(&vfio_pci_dma_buf_iommufd_map) fn;
+ int rc;
+
+ rc = iommufd_test_dma_buf_iommufd_map(attachment, phys);
+ if (rc != -EOPNOTSUPP)
+ return rc;
+
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF))
+ return -EOPNOTSUPP;
+
+ fn = symbol_get(vfio_pci_dma_buf_iommufd_map);
+ if (!fn)
+ return -EOPNOTSUPP;
+ rc = fn(attachment, phys);
+ symbol_put(vfio_pci_dma_buf_iommufd_map);
+ return rc;
+}
+
+static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages,
+ struct dma_buf *dmabuf)
+{
+ struct dma_buf_attachment *attach;
+ int rc;
+
+ attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(),
+ &iopt_dmabuf_attach_revoke_ops, pages);
+ if (IS_ERR(attach))
+ return PTR_ERR(attach);
+
+ dma_resv_lock(dmabuf->resv, NULL);
+ /*
+ * Lock ordering requires the mutex to be taken inside the reservation,
+ * make sure lockdep sees this.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ mutex_lock(&pages->mutex);
+ mutex_unlock(&pages->mutex);
+ }
+
+ rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys);
+ if (rc)
+ goto err_detach;
+
+ dma_resv_unlock(dmabuf->resv);
+
+ /* On success iopt_release_pages() will detach and put the dmabuf. */
+ pages->dmabuf.attach = attach;
+ return 0;
+
+err_detach:
+ dma_resv_unlock(dmabuf->resv);
+ dma_buf_detach(dmabuf, attach);
+ return rc;
+}
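
The empty lock/unlock pair above protects nothing by itself; it exists only so lockdep records the reservation-before-mutex ordering on this path, where the mutex is not otherwise needed. The general shape of the trick (an illustrative sketch, not code from this patch):

    mutex_lock(&A);
    if (IS_ENABLED(CONFIG_LOCKDEP)) {
            mutex_lock(&B);         /* record the A -> B ordering */
            mutex_unlock(&B);
    }
    /* ... work that genuinely needs only A ... */
    mutex_unlock(&A);

Once the ordering is on record, any later path that tries to take A while holding B is flagged immediately rather than only when the two paths actually race.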
+
+struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx,
+ struct dma_buf *dmabuf,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable)
+{
+ static struct lock_class_key pages_dmabuf_mutex_key;
+ struct iopt_pages *pages;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (dmabuf->size <= (start + length - 1) ||
+ length / PAGE_SIZE >= MAX_NPFNS)
+ return ERR_PTR(-EINVAL);
+
+ pages = iopt_alloc_pages(start_byte, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+
+ /*
+ * The mmap_lock can be held when obtaining the dmabuf reservation lock,
+ * which creates a locking cycle with the pages mutex, as that mutex is
+ * normally held while obtaining the mmap_lock. This locking path is not
+ * present for IOPT_ADDRESS_DMABUF, so split the lock class.
+ */
+ lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key);
+
+ /* dmabuf does not use pinned page accounting. */
+ pages->account_mode = IOPT_PAGES_ACCOUNT_NONE;
+ pages->type = IOPT_ADDRESS_DMABUF;
+ pages->dmabuf.start = start - start_byte;
+ INIT_LIST_HEAD(&pages->dmabuf.tracker);
+
+ rc = iopt_map_dmabuf(ictx, pages, dmabuf);
+ if (rc) {
+ iopt_put_pages(pages);
+ return ERR_PTR(rc);
+ }
+
+ return pages;
+}
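
lockdep_set_class() with a function-local static struct lock_class_key is the stock idiom for splitting one instance of a lock out of its default class when a problematic ordering only exists for the other instances. A minimal sketch with hypothetical names:

    static struct lock_class_key special_key;
    struct foo *f;

    f = kzalloc(sizeof(*f), GFP_KERNEL);
    if (!f)
            return -ENOMEM;
    mutex_init(&f->lock);                      /* default class */
    lockdep_set_class(&f->lock, &special_key); /* move to its own class */

Orderings recorded against other foo instances then no longer apply to f->lock, which is exactly what the dmabuf case relies on.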
+
+int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area,
+ struct iommu_domain *domain)
+{
+ struct iopt_pages_dmabuf_track *track;
+
+ lockdep_assert_held(&pages->mutex);
+ if (WARN_ON(!iopt_is_dmabuf(pages)))
+ return -EINVAL;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm)
+ if (WARN_ON(track->domain == domain && track->area == area))
+ return -EINVAL;
+
+ track = kzalloc(sizeof(*track), GFP_KERNEL);
+ if (!track)
+ return -ENOMEM;
+ track->domain = domain;
+ track->area = area;
+ list_add_tail(&track->elm, &pages->dmabuf.tracker);
+
+ return 0;
+}
+
+void iopt_dmabuf_untrack_domain(struct iopt_pages *pages,
+ struct iopt_area *area,
+ struct iommu_domain *domain)
+{
+ struct iopt_pages_dmabuf_track *track;
+
+ lockdep_assert_held(&pages->mutex);
+ WARN_ON(!iopt_is_dmabuf(pages));
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm) {
+ if (track->domain == domain && track->area == area) {
+ list_del(&track->elm);
+ kfree(track);
+ return;
+ }
+ }
+ WARN_ON(true);
+}
+
+int iopt_dmabuf_track_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages)
+{
+ struct iopt_pages_dmabuf_track *track;
+ struct iommu_domain *domain;
+ unsigned long index;
+ int rc;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm)
+ if (WARN_ON(track->area == area))
+ return -EINVAL;
+
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = iopt_dmabuf_track_domain(pages, area, domain);
+ if (rc)
+ goto err_untrack;
+ }
+ return 0;
+err_untrack:
+ iopt_dmabuf_untrack_all_domains(area, pages);
+ return rc;
+}
+
+void iopt_dmabuf_untrack_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages)
+{
+ struct iopt_pages_dmabuf_track *track;
+ struct iopt_pages_dmabuf_track *tmp;
+
+ list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker,
+ elm) {
+ if (track->area == area) {
+ list_del(&track->elm);
+ kfree(track);
+ }
+ }
+}
+
void iopt_release_pages(struct kref *kref)
{
struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
@@ -1372,8 +1653,15 @@ void iopt_release_pages(struct kref *kref)
mutex_destroy(&pages->mutex);
put_task_struct(pages->source_task);
free_uid(pages->source_user);
- if (pages->type == IOPT_ADDRESS_FILE)
+ if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) {
+ struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf;
+
+ dma_buf_detach(dmabuf, pages->dmabuf.attach);
+ dma_buf_put(dmabuf);
+ WARN_ON(!list_empty(&pages->dmabuf.tracker));
+ } else if (pages->type == IOPT_ADDRESS_FILE) {
fput(pages->file);
+ }
kfree(pages);
}
@@ -1451,6 +1739,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area,
lockdep_assert_held(&pages->mutex);
+ if (iopt_is_dmabuf(pages)) {
+ if (WARN_ON(iopt_dmabuf_revoked(pages)))
+ return;
+ iopt_area_unmap_domain_range(area, domain, start_index,
+ last_index);
+ return;
+ }
+
/*
* For security we must not unpin something that is still DMA mapped,
* so this must unmap any IOVA before we go ahead and unpin the pages.
@@ -1526,6 +1822,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain)
void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
struct iommu_domain *domain)
{
+ if (iopt_dmabuf_revoked(pages))
+ return;
+
__iopt_area_unfill_domain(area, pages, domain,
iopt_area_last_index(area));
}
@@ -1546,6 +1845,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
lockdep_assert_held(&area->pages->mutex);
+ if (iopt_dmabuf_revoked(area->pages))
+ return 0;
+
rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
iopt_area_last_index(area));
if (rc)
@@ -1605,33 +1907,44 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
return 0;
mutex_lock(&pages->mutex);
- rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
- iopt_area_last_index(area));
- if (rc)
- goto out_unlock;
+ if (iopt_is_dmabuf(pages)) {
+ rc = iopt_dmabuf_track_all_domains(area, pages);
+ if (rc)
+ goto out_unlock;
+ }
- while (!pfn_reader_done(&pfns)) {
- done_first_end_index = pfns.batch_end_index;
- done_all_end_index = pfns.batch_start_index;
- xa_for_each(&area->iopt->domains, index, domain) {
- rc = batch_to_domain(&pfns.batch, domain, area,
- pfns.batch_start_index);
+ if (!iopt_dmabuf_revoked(pages)) {
+ rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
+ iopt_area_last_index(area));
+ if (rc)
+ goto out_untrack;
+
+ while (!pfn_reader_done(&pfns)) {
+ done_first_end_index = pfns.batch_end_index;
+ done_all_end_index = pfns.batch_start_index;
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = batch_to_domain(&pfns.batch, domain, area,
+ pfns.batch_start_index);
+ if (rc)
+ goto out_unmap;
+ }
+ done_all_end_index = done_first_end_index;
+
+ rc = pfn_reader_next(&pfns);
if (rc)
goto out_unmap;
}
- done_all_end_index = done_first_end_index;
-
- rc = pfn_reader_next(&pfns);
+ rc = pfn_reader_update_pinned(&pfns);
if (rc)
goto out_unmap;
+
+ pfn_reader_destroy(&pfns);
}
- rc = pfn_reader_update_pinned(&pfns);
- if (rc)
- goto out_unmap;
area->storage_domain = xa_load(&area->iopt->domains, 0);
interval_tree_insert(&area->pages_node, &pages->domains_itree);
- goto out_destroy;
+ mutex_unlock(&pages->mutex);
+ return 0;
out_unmap:
pfn_reader_release_pins(&pfns);
@@ -1658,8 +1971,10 @@ out_unmap:
end_index);
}
}
-out_destroy:
pfn_reader_destroy(&pfns);
+out_untrack:
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_all_domains(area, pages);
out_unlock:
mutex_unlock(&pages->mutex);
return rc;
@@ -1685,16 +2000,22 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
if (!area->storage_domain)
goto out_unlock;
- xa_for_each(&iopt->domains, index, domain)
- if (domain != area->storage_domain)
+ xa_for_each(&iopt->domains, index, domain) {
+ if (domain == area->storage_domain)
+ continue;
+
+ if (!iopt_dmabuf_revoked(pages))
iopt_area_unmap_domain_range(
area, domain, iopt_area_index(area),
iopt_area_last_index(area));
+ }
if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
interval_tree_remove(&area->pages_node, &pages->domains_itree);
iopt_area_unfill_domain(area, pages, area->storage_domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_all_domains(area, pages);
area->storage_domain = NULL;
out_unlock:
mutex_unlock(&pages->mutex);
@@ -2031,15 +2352,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
return -EPERM;
- if (pages->type == IOPT_ADDRESS_FILE)
+ if (iopt_is_dmabuf(pages))
+ return -EINVAL;
+
+ if (pages->type != IOPT_ADDRESS_USER)
return iopt_pages_rw_slow(pages, start_index, last_index,
start_byte % PAGE_SIZE, data, length,
flags);
- if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
- WARN_ON(pages->type != IOPT_ADDRESS_USER))
- return -EINVAL;
-
if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
if (start_index == last_index)
return iopt_pages_rw_page(pages, start_index,
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index de178827a078..c4322fd26f93 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -5,6 +5,8 @@
*/
#include <linux/anon_inodes.h>
#include <linux/debugfs.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/fault-inject.h>
#include <linux/file.h>
#include <linux/iommu.h>
@@ -12,6 +14,8 @@
#include <linux/slab.h>
#include <linux/xarray.h>
#include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
+#include "../iommu-pages.h"
#include "../iommu-priv.h"
#include "io_pagetable.h"
@@ -41,21 +45,6 @@ static DEFINE_IDA(mock_dev_ida);
enum {
MOCK_DIRTY_TRACK = 1,
- MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
- MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE,
-
- /*
- * Like a real page table alignment requires the low bits of the address
- * to be zero. xarray also requires the high bit to be zero, so we store
- * the pfns shifted. The upper bits are used for metadata.
- */
- MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE,
-
- _MOCK_PFN_START = MOCK_PFN_MASK + 1,
- MOCK_PFN_START_IOVA = _MOCK_PFN_START,
- MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
- MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
- MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2,
};
static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain);
@@ -124,10 +113,15 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
}
struct mock_iommu_domain {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ struct pt_iommu_amdv1 amdv1;
+ };
unsigned long flags;
- struct iommu_domain domain;
- struct xarray pfns;
};
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain);
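
The union lets one allocation be viewed as a bare iommu_domain, a generic pt_iommu, or the AMDv1 flavour, and the two PT_IOMMU_CHECK_DOMAIN() lines verify at compile time that the embedded domain sits at the same offset through every view. Roughly what such a check has to enforce, as an illustrative sketch (it assumes struct pt_iommu embeds its iommu_domain as a member named domain, which is not visible in this hunk):

    static_assert(offsetof(struct mock_iommu_domain, domain) ==
                  offsetof(struct mock_iommu_domain, amdv1.iommu.domain),
                  "iommu_domain must alias across all union members");

With that guarantee, to_mock_domain() can container_of() back from the iommu_domain pointer regardless of which view produced it.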
static inline struct mock_iommu_domain *
to_mock_domain(struct iommu_domain *domain)
@@ -216,7 +210,7 @@ static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj)
}
static int mock_domain_nop_attach(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct mock_dev *mdev = to_mock_dev(dev);
struct mock_viommu *new_viommu = NULL;
@@ -344,74 +338,6 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
return 0;
}
-static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock,
- unsigned long iova, size_t page_size,
- unsigned long flags)
-{
- unsigned long cur, end = iova + page_size - 1;
- bool dirty = false;
- void *ent, *old;
-
- for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) {
- ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
- if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA))
- continue;
-
- dirty = true;
- /* Clear dirty */
- if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
- unsigned long val;
-
- val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
- old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE,
- xa_mk_value(val), GFP_KERNEL);
- WARN_ON_ONCE(ent != old);
- }
- }
-
- return dirty;
-}
-
-static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- unsigned long end = iova + size;
- void *ent;
-
- if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
- return -EINVAL;
-
- do {
- unsigned long pgsize = MOCK_IO_PAGE_SIZE;
- unsigned long head;
-
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- if (!ent) {
- iova += pgsize;
- continue;
- }
-
- if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA)
- pgsize = MOCK_HUGE_PAGE_SIZE;
- head = iova & ~(pgsize - 1);
-
- /* Clear dirty */
- if (mock_test_and_clear_dirty(mock, head, pgsize, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-static const struct iommu_dirty_ops dirty_ops = {
- .set_dirty_tracking = mock_domain_set_dirty_tracking,
- .read_and_clear_dirty = mock_domain_read_and_clear_dirty,
-};
-
static struct mock_iommu_domain_nested *
__mock_domain_alloc_nested(const struct iommu_user_data *user_data)
{
@@ -446,7 +372,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
if (flags & ~IOMMU_HWPT_ALLOC_PASID)
return ERR_PTR(-EOPNOTSUPP);
- if (!parent || parent->ops != mock_ops.default_domain_ops)
+ if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING))
return ERR_PTR(-EINVAL);
mock_parent = to_mock_domain(parent);
@@ -459,159 +385,170 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
return &mock_nested->domain;
}
-static struct iommu_domain *
-mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
- const struct iommu_user_data *user_data)
-{
- bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
- IOMMU_HWPT_ALLOC_NEST_PARENT |
- IOMMU_HWPT_ALLOC_PASID;
- struct mock_dev *mdev = to_mock_dev(dev);
- bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
- struct mock_iommu_domain *mock;
-
- if (user_data)
- return ERR_PTR(-EOPNOTSUPP);
- if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
- return ERR_PTR(-EOPNOTSUPP);
-
- mock = kzalloc(sizeof(*mock), GFP_KERNEL);
- if (!mock)
- return ERR_PTR(-ENOMEM);
- mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
- mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
- mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
- if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA)
- mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE;
- mock->domain.ops = mock_ops.default_domain_ops;
- mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
- xa_init(&mock->pfns);
-
- if (has_dirty_flag)
- mock->domain.dirty_ops = &dirty_ops;
- return &mock->domain;
-}
-
static void mock_domain_free(struct iommu_domain *domain)
{
struct mock_iommu_domain *mock = to_mock_domain(domain);
- WARN_ON(!xa_empty(&mock->pfns));
+ pt_iommu_deinit(&mock->iommu);
kfree(mock);
}
-static int mock_domain_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount, int prot,
- gfp_t gfp, size_t *mapped)
+static void mock_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- unsigned long flags = MOCK_PFN_START_IOVA;
- unsigned long start_iova = iova;
+ iommu_put_pages_list(&gather->freelist);
+}
- /*
- * xarray does not reliably work with fault injection because it does a
- * retry allocation, so put our own failure point.
- */
- if (iommufd_should_fail())
- return -ENOENT;
+static const struct iommu_domain_ops amdv1_mock_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
- for (; pgcount; pgcount--) {
- size_t cur;
+static const struct iommu_domain_ops amdv1_mock_huge_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
+#undef pt_iommu_amdv1_mock_map_pages
- for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
- void *old;
+static const struct iommu_dirty_ops amdv1_mock_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1_mock),
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
- if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
- flags = MOCK_PFN_LAST_IOVA;
- if (pgsize != MOCK_IO_PAGE_SIZE) {
- flags |= MOCK_PFN_HUGE_IOVA;
- }
- old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE,
- xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) |
- flags),
- gfp);
- if (xa_is_err(old)) {
- for (; start_iova != iova;
- start_iova += MOCK_IO_PAGE_SIZE)
- xa_erase(&mock->pfns,
- start_iova /
- MOCK_IO_PAGE_SIZE);
- return xa_err(old);
- }
- WARN_ON(old);
- iova += MOCK_IO_PAGE_SIZE;
- paddr += MOCK_IO_PAGE_SIZE;
- *mapped += MOCK_IO_PAGE_SIZE;
- flags = 0;
- }
- }
- return 0;
-}
+static const struct iommu_domain_ops amdv1_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
-static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
- unsigned long iova, size_t pgsize,
- size_t pgcount,
- struct iommu_iotlb_gather *iotlb_gather)
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1),
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
+
+static struct mock_iommu_domain *
+mock_domain_alloc_pgtable(struct device *dev,
+ const struct iommu_hwpt_selftest *user_cfg, u32 flags)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- bool first = true;
- size_t ret = 0;
- void *ent;
+ struct mock_iommu_domain *mock;
+ int rc;
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+ mock = kzalloc(sizeof(*mock), GFP_KERNEL);
+ if (!mock)
+ return ERR_PTR(-ENOMEM);
+ mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
- for (; pgcount; pgcount--) {
- size_t cur;
+ mock->amdv1.iommu.nid = NUMA_NO_NODE;
+
+ switch (user_cfg->pagetable_type) {
+ case MOCK_IOMMUPT_DEFAULT:
+ case MOCK_IOMMUPT_HUGE: {
+ struct pt_iommu_amdv1_cfg cfg = {};
+
+ /* The mock version has a 2k page size */
+ cfg.common.hw_max_vasz_lg2 = 56;
+ cfg.common.hw_max_oasz_lg2 = 51;
+ cfg.starting_level = 2;
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+ mock->domain.ops = &amdv1_mock_huge_ops;
+ else
+ mock->domain.ops = &amdv1_mock_ops;
+ rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL);
+ if (rc)
+ goto err_free;
+
+ /*
+ * In huge mode userspace should only provide huge pages, but
+ * PAGE_SIZE has to be included for the domain to be accepted by
+ * iommufd.
+ */
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+ mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE |
+ PAGE_SIZE;
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ mock->domain.dirty_ops = &amdv1_mock_dirty_ops;
+ break;
+ }
- for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
- ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ case MOCK_IOMMUPT_AMDV1: {
+ struct pt_iommu_amdv1_cfg cfg = {};
+
+ cfg.common.hw_max_vasz_lg2 = 64;
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+ cfg.starting_level = 2;
+ mock->domain.ops = &amdv1_ops;
+ rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL);
+ if (rc)
+ goto err_free;
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ mock->domain.dirty_ops = &amdv1_dirty_ops;
+ break;
+ }
+ default:
+ rc = -EOPNOTSUPP;
+ goto err_free;
+ }
- /*
- * iommufd generates unmaps that must be a strict
- * superset of the maps performed. So every
- * starting/ending IOVA should have been an iova passed
- * to map.
- *
- * This simple logic doesn't work when the HUGE_PAGE is
- * turned on since the core code will automatically
- * switch between the two page sizes creating a break in
- * the unmap calls. The break can land in the middle of
- * contiguous IOVA.
- */
- if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) {
- if (first) {
- WARN_ON(ent && !(xa_to_value(ent) &
- MOCK_PFN_START_IOVA));
- first = false;
- }
- if (pgcount == 1 &&
- cur + MOCK_IO_PAGE_SIZE == pgsize)
- WARN_ON(ent && !(xa_to_value(ent) &
- MOCK_PFN_LAST_IOVA));
- }
+ /*
+ * Override the real aperture with the MOCK aperture for test purposes.
+ */
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) {
+ WARN_ON(mock->domain.geometry.aperture_start != 0);
+ WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST);
- iova += MOCK_IO_PAGE_SIZE;
- ret += MOCK_IO_PAGE_SIZE;
- }
+ mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
+ mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
}
- return ret;
+
+ return mock;
+err_free:
+ kfree(mock);
+ return ERR_PTR(rc);
}
-static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
+static struct iommu_domain *
+mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
+ const struct iommu_user_data *user_data)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- void *ent;
+ bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_PASID;
+ struct mock_dev *mdev = to_mock_dev(dev);
+ bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+ struct iommu_hwpt_selftest user_cfg = {};
+ struct mock_iommu_domain *mock;
+ int rc;
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- WARN_ON(!ent);
- return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE;
+ if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST &&
+ user_data->type != IOMMU_HWPT_DATA_NONE))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (user_data) {
+ rc = iommu_copy_struct_from_user(
+ &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
+ }
+
+ mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags);
+ if (IS_ERR(mock))
+ return ERR_CAST(mock);
+ return &mock->domain;
}
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
@@ -955,15 +892,6 @@ static const struct iommu_ops mock_ops = {
.user_pasid_table = true,
.get_viommu_size = mock_get_viommu_size,
.viommu_init = mock_viommu_init,
- .default_domain_ops =
- &(struct iommu_domain_ops){
- .free = mock_domain_free,
- .attach_dev = mock_domain_nop_attach,
- .map_pages = mock_domain_map_pages,
- .unmap_pages = mock_domain_unmap_pages,
- .iova_to_phys = mock_domain_iova_to_phys,
- .set_dev_pasid = mock_domain_set_dev_pasid_nop,
- },
};
static void mock_domain_free_nested(struct iommu_domain *domain)
@@ -1047,7 +975,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
if (IS_ERR(hwpt))
return hwpt;
if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
- hwpt->domain->ops != mock_ops.default_domain_ops) {
+ hwpt->domain->owner != &mock_ops) {
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
@@ -1088,7 +1016,6 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{},
};
const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY |
- MOCK_FLAGS_DEVICE_HUGE_IOVA |
MOCK_FLAGS_DEVICE_PASID;
struct mock_dev *mdev;
int rc, i;
@@ -1277,23 +1204,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
{
struct iommufd_hw_pagetable *hwpt;
struct mock_iommu_domain *mock;
+ unsigned int page_size;
uintptr_t end;
int rc;
- if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE ||
- (uintptr_t)uptr % MOCK_IO_PAGE_SIZE ||
- check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
- return -EINVAL;
-
hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- for (; length; length -= MOCK_IO_PAGE_SIZE) {
+ page_size = 1 << __ffs(mock->domain.pgsize_bitmap);
+ if (iova % page_size || length % page_size ||
+ (uintptr_t)uptr % page_size ||
+ check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
+ return -EINVAL;
+
+ for (; length; length -= page_size) {
struct page *pages[1];
+ phys_addr_t io_phys;
unsigned long pfn;
long npages;
- void *ent;
npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0,
pages);
@@ -1308,15 +1237,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
pfn = page_to_pfn(pages[0]);
put_page(pages[0]);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- if (!ent ||
- (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE !=
- pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
+ io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova);
+ if (io_phys !=
+ pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
rc = -EINVAL;
goto out_put;
}
- iova += MOCK_IO_PAGE_SIZE;
- uptr += MOCK_IO_PAGE_SIZE;
+ iova += page_size;
+ uptr += page_size;
}
rc = 0;
@@ -1795,7 +1723,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+ if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) {
rc = -EINVAL;
goto out_put;
}
@@ -1814,22 +1742,10 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
}
for (i = 0; i < max; i++) {
- unsigned long cur = iova + i * page_size;
- void *ent, *old;
-
if (!test_bit(i, (unsigned long *)tmp))
continue;
-
- ent = xa_load(&mock->pfns, cur / page_size);
- if (ent) {
- unsigned long val;
-
- val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
- old = xa_store(&mock->pfns, cur / page_size,
- xa_mk_value(val), GFP_KERNEL);
- WARN_ON_ONCE(ent != old);
- count++;
- }
+ mock->iommu.ops->set_dirty(&mock->iommu, iova + i * page_size);
+ count++;
}
cmd->dirty.out_nr_dirty = count;
@@ -2031,6 +1947,140 @@ void iommufd_selftest_destroy(struct iommufd_object *obj)
}
}
+struct iommufd_test_dma_buf {
+ void *memory;
+ size_t length;
+ bool revoked;
+};
+
+static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ return 0;
+}
+
+static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+}
+
+static struct sg_table *
+iommufd_test_dma_buf_map(struct dma_buf_attachment *attachment,
+ enum dma_data_direction dir)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment,
+ struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+}
+
+static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf)
+{
+ struct iommufd_test_dma_buf *priv = dmabuf->priv;
+
+ kfree(priv->memory);
+ kfree(priv);
+}
+
+static const struct dma_buf_ops iommufd_test_dmabuf_ops = {
+ .attach = iommufd_test_dma_buf_attach,
+ .detach = iommufd_test_dma_buf_detach,
+ .map_dma_buf = iommufd_test_dma_buf_map,
+ .release = iommufd_test_dma_buf_release,
+ .unmap_dma_buf = iommufd_test_dma_buf_unmap,
+};
+
+int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv;
+
+ dma_resv_assert_held(attachment->dmabuf->resv);
+
+ if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops)
+ return -EOPNOTSUPP;
+
+ if (priv->revoked)
+ return -ENODEV;
+
+ phys->paddr = virt_to_phys(priv->memory);
+ phys->len = priv->length;
+ return 0;
+}
+
+static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd,
+ unsigned int open_flags,
+ size_t len)
+{
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct iommufd_test_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int rc;
+
+ len = ALIGN(len, PAGE_SIZE);
+ if (len == 0 || len > PAGE_SIZE * 512)
+ return -EINVAL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->length = len;
+ priv->memory = kzalloc(len, GFP_KERNEL);
+ if (!priv->memory) {
+ rc = -ENOMEM;
+ goto err_free;
+ }
+
+ exp_info.ops = &iommufd_test_dmabuf_ops;
+ exp_info.size = len;
+ exp_info.flags = open_flags;
+ exp_info.priv = priv;
+
+ dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(dmabuf)) {
+ rc = PTR_ERR(dmabuf);
+ goto err_free;
+ }
+
+ return dma_buf_fd(dmabuf, open_flags);
+
+err_free:
+ kfree(priv->memory);
+ kfree(priv);
+ return rc;
+}
+
+static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd,
+ bool revoked)
+{
+ struct iommufd_test_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int rc = 0;
+
+ dmabuf = dma_buf_get(fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (dmabuf->ops != &iommufd_test_dmabuf_ops) {
+ rc = -EOPNOTSUPP;
+ goto err_put;
+ }
+
+ priv = dmabuf->priv;
+ dma_resv_lock(dmabuf->resv, NULL);
+ priv->revoked = revoked;
+ dma_buf_move_notify(dmabuf);
+ dma_resv_unlock(dmabuf->resv);
+
+err_put:
+ dma_buf_put(dmabuf);
+ return rc;
+}
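
Together with iopt_revoke_notify() earlier in this patch, this completes the revoke round-trip: the exporter flips its state and broadcasts under the reservation lock, and every importer's move_notify callback tears its mappings down before the lock is released. Condensed, paraphrasing the code above rather than adding anything new:

    /* exporter side */
    dma_resv_lock(dmabuf->resv, NULL);
    priv->revoked = true;
    dma_buf_move_notify(dmabuf);    /* runs each importer's move_notify */
    dma_resv_unlock(dmabuf->resv);

    /* importer side, in move_notify: unmap every tracked domain range
     * and set phys.len = 0 so the buffer reads back as revoked.
     */

Any later mapping attempt then sees priv->revoked and fails with -ENODEV.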
+
int iommufd_test(struct iommufd_ucmd *ucmd)
{
struct iommu_test_cmd *cmd = ucmd->cmd;
@@ -2109,6 +2159,13 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return iommufd_test_pasid_detach(ucmd, cmd);
case IOMMU_TEST_OP_PASID_CHECK_HWPT:
return iommufd_test_pasid_check_hwpt(ucmd, cmd);
+ case IOMMU_TEST_OP_DMABUF_GET:
+ return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags,
+ cmd->dmabuf_get.length);
+ case IOMMU_TEST_OP_DMABUF_REVOKE:
+ return iommufd_test_dmabuf_revoke(ucmd,
+ cmd->dmabuf_revoke.dmabuf_fd,
+ cmd->dmabuf_revoke.revoked);
default:
return -EOPNOTSUPP;
}
@@ -2202,3 +2259,5 @@ void iommufd_test_exit(void)
platform_device_unregister(selftest_iommu_dev);
debugfs_remove_recursive(dbgfs_root);
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index ffa892f65714..ca848288dbf2 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -590,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain)
}
static int ipmmu_attach_device(struct iommu_domain *io_domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
@@ -637,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain,
}
static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_domain *domain;
unsigned int i;
- if (io_domain == identity_domain || !io_domain)
+ if (old == identity_domain || !old)
return 0;
- domain = to_vmsa_domain(io_domain);
+ domain = to_vmsa_domain(old);
for (i = 0; i < fwspec->num_ids; ++i)
ipmmu_utlb_disable(domain, fwspec->ids[i]);
@@ -720,6 +720,8 @@ static int ipmmu_init_platform_device(struct device *dev,
dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev));
+ put_device(&ipmmu_pdev->dev);
+
return 0;
}
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 43a61ba021a5..819add75a665 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev)
return &iommu->iommu;
}
-static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
unsigned long flags;
@@ -441,19 +442,19 @@ fail:
}
static int msm_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct msm_priv *priv;
unsigned long flags;
struct msm_iommu_dev *iommu;
struct msm_iommu_ctx_dev *master;
int ret = 0;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- priv = to_msm_priv(domain);
+ priv = to_msm_priv(old);
free_io_pgtable_ops(priv->iop);
spin_lock_irqsave(&msm_iommu_lock, flags);
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 0e0285348d2b..60fcd3d3b5eb 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -139,6 +139,7 @@
/* 2 bits: iommu type */
#define MTK_IOMMU_TYPE_MM (0x0 << 13)
#define MTK_IOMMU_TYPE_INFRA (0x1 << 13)
+#define MTK_IOMMU_TYPE_APU (0x2 << 13)
#define MTK_IOMMU_TYPE_MASK (0x3 << 13)
/* PM and clock always on. e.g. infra iommu */
#define PM_CLK_AO BIT(15)
@@ -147,6 +148,7 @@
#define TF_PORT_TO_ADDR_MT8173 BIT(18)
#define INT_ID_PORT_WIDTH_6 BIT(19)
#define CFG_IFA_MASTER_IN_ATF BIT(20)
+#define DL_WITH_MULTI_LARB BIT(21)
#define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \
((((pdata)->flags) & (mask)) == (_x))
@@ -172,6 +174,7 @@ enum mtk_iommu_plat {
M4U_MT8183,
M4U_MT8186,
M4U_MT8188,
+ M4U_MT8189,
M4U_MT8192,
M4U_MT8195,
M4U_MT8365,
@@ -335,6 +338,8 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data, unsigned int ban
*/
#define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL
+static LIST_HEAD(apulist); /* List the APU iommu HWs */
+static LIST_HEAD(infralist); /* List the infra iommu HWs */
static LIST_HEAD(m4ulist); /* List all the M4U HWs */
#define for_each_m4u(data, head) list_for_each_entry(data, head, list)
@@ -350,6 +355,15 @@ static const struct mtk_iommu_iova_region single_domain[] = {
#define MT8192_MULTI_REGION_NR (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) ? \
MT8192_MULTI_REGION_NR_MAX : 1)
+static const struct mtk_iommu_iova_region mt8189_multi_dom_apu[] = {
+ { .iova_base = 0x200000ULL, .size = SZ_512M}, /* APU SECURE */
+#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)
+ { .iova_base = SZ_1G, .size = 0xc0000000}, /* APU CODE */
+ { .iova_base = 0x70000000ULL, .size = 0x12600000}, /* APU VLM */
+ { .iova_base = SZ_4G, .size = SZ_4G * 3}, /* APU VPU */
+#endif
+};
+
static const struct mtk_iommu_iova_region mt8192_multi_dom[MT8192_MULTI_REGION_NR] = {
{ .iova_base = 0x0, .size = MTK_IOMMU_IOVA_SZ_4G}, /* 0 ~ 4G, */
#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)
@@ -705,7 +719,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
}
static int mtk_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata;
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
@@ -773,12 +787,12 @@ err_unlock:
}
static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
mtk_iommu_config(data, dev, false, 0);
@@ -865,6 +879,7 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
struct device_link *link;
struct device *larbdev;
+ unsigned long larbid_msk = 0;
unsigned int larbid, larbidx, i;
if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
@@ -872,30 +887,50 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
/*
* Link the consumer device with the smi-larb device(supplier).
- * The device that connects with each a larb is a independent HW.
- * All the ports in each a device should be in the same larbs.
+ * With DL_WITH_MULTI_LARB: the master may connect to multiple larbs,
+ * so we should create a device link with each larb.
+ * Without DL_WITH_MULTI_LARB: the master must connect to exactly one
+ * larb, otherwise fail.
*/
larbid = MTK_M4U_TO_LARB(fwspec->ids[0]);
if (larbid >= MTK_LARB_NR_MAX)
return ERR_PTR(-EINVAL);
+ larbid_msk |= BIT(larbid);
+
for (i = 1; i < fwspec->num_ids; i++) {
larbidx = MTK_M4U_TO_LARB(fwspec->ids[i]);
- if (larbid != larbidx) {
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, DL_WITH_MULTI_LARB)) {
+ larbid_msk |= BIT(larbidx);
+ } else if (larbid != larbidx) {
dev_err(dev, "Can only use one larb. Fail@larb%d-%d.\n",
larbid, larbidx);
return ERR_PTR(-EINVAL);
}
}
- larbdev = data->larb_imu[larbid].dev;
- if (!larbdev)
- return ERR_PTR(-EINVAL);
- link = device_link_add(dev, larbdev,
- DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS);
- if (!link)
- dev_err(dev, "Unable to link %s\n", dev_name(larbdev));
+ for_each_set_bit(larbid, &larbid_msk, 32) {
+ larbdev = data->larb_imu[larbid].dev;
+ if (!larbdev)
+ return ERR_PTR(-EINVAL);
+
+ link = device_link_add(dev, larbdev,
+ DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS);
+ if (!link) {
+ dev_err(dev, "Unable to link %s\n", dev_name(larbdev));
+ goto link_remove;
+ }
+ }
+
return &data->iommu;
+
+link_remove:
+ for_each_set_bit(i, &larbid_msk, larbid) {
+ larbdev = data->larb_imu[i].dev;
+ device_link_remove(dev, larbdev);
+ }
+
+ return ERR_PTR(-ENODEV);
}
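
The unwind path relies on the third argument of for_each_set_bit() being an exclusive upper bound: iterating the mask only up to the failing larbid releases exactly the links that were created and nothing more. The general shape of the idiom, as a sketch with illustrative names:

    unsigned long msk = 0;
    unsigned int id, i;

    /* ... set one bit in msk per resource to acquire ... */
    for_each_set_bit(id, &msk, 32) {
            if (acquire(id))
                    goto unwind;
    }
    return 0;

    unwind:
    /* only bits below the failing id were acquired */
    for_each_set_bit(i, &msk, id)
            release(i);
    return -ENODEV;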
static void mtk_iommu_release_device(struct device *dev)
@@ -903,11 +938,19 @@ static void mtk_iommu_release_device(struct device *dev)
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct mtk_iommu_data *data;
struct device *larbdev;
- unsigned int larbid;
+ unsigned int larbid, i;
+ unsigned long larbid_msk = 0;
data = dev_iommu_priv_get(dev);
- if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
- larbid = MTK_M4U_TO_LARB(fwspec->ids[0]);
+ if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+ return;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ larbid = MTK_M4U_TO_LARB(fwspec->ids[i]);
+ larbid_msk |= BIT(larbid);
+ }
+
+ for_each_set_bit(larbid, &larbid_msk, 32) {
larbdev = data->larb_imu[larbid].dev;
device_link_remove(dev, larbdev);
}
@@ -974,6 +1017,8 @@ static int mtk_iommu_of_xlate(struct device *dev,
return -EINVAL;
dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+
+ put_device(&m4updev->dev);
}
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -1211,16 +1256,19 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m
}
component_match_add(dev, match, component_compare_dev, &plarbdev->dev);
- platform_device_put(plarbdev);
}
- if (!frst_avail_smicomm_node)
- return -EINVAL;
+ if (!frst_avail_smicomm_node) {
+ ret = -EINVAL;
+ goto err_larbdev_put;
+ }
pcommdev = of_find_device_by_node(frst_avail_smicomm_node);
of_node_put(frst_avail_smicomm_node);
- if (!pcommdev)
- return -ENODEV;
+ if (!pcommdev) {
+ ret = -ENODEV;
+ goto err_larbdev_put;
+ }
data->smicomm_dev = &pcommdev->dev;
link = device_link_add(data->smicomm_dev, dev,
@@ -1228,16 +1276,16 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m
platform_device_put(pcommdev);
if (!link) {
dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev));
- return -EINVAL;
+ ret = -EINVAL;
+ goto err_larbdev_put;
}
return 0;
err_larbdev_put:
- for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) {
- if (!data->larb_imu[i].dev)
- continue;
+ /* The ID mapping may not be linear, so loop over the whole array */
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
put_device(data->larb_imu[i].dev);
- }
+
return ret;
}
@@ -1400,8 +1448,12 @@ out_sysfs_remove:
iommu_device_sysfs_remove(&data->iommu);
out_list_del:
list_del(&data->list);
- if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+ if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
device_link_remove(data->smicomm_dev, dev);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
+ }
out_runtime_disable:
pm_runtime_disable(dev);
return ret;
@@ -1421,6 +1473,9 @@ static void mtk_iommu_remove(struct platform_device *pdev)
if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
device_link_remove(data->smicomm_dev, &pdev->dev);
component_master_del(&pdev->dev, &mtk_iommu_com_ops);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
}
pm_runtime_disable(&pdev->dev);
for (i = 0; i < data->plat_data->banks_num; i++) {
@@ -1695,6 +1750,66 @@ static const struct mtk_iommu_plat_data mt8188_data_vpp = {
27, 28 /* ccu0 */, MTK_INVALID_LARBID}, {4, 6}},
};
+static const unsigned int mt8189_apu_region_msk[][MTK_LARB_NR_MAX] = {
+ [0] = {[0] = BIT(2)}, /* Region0: fake larb 0 APU_SECURE */
+ [1] = {[0] = BIT(1)}, /* Region1: fake larb 0 APU_CODE */
+ [2] = {[0] = BIT(3)}, /* Region2: fake larb 0 APU_VLM */
+ [3] = {[0] = BIT(0)}, /* Region3: fake larb 0 APU_DATA */
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_apu = {
+ .m4u_plat = M4U_MT8189,
+ .flags = IOVA_34_EN | DCM_DISABLE |
+ MTK_IOMMU_TYPE_APU | PGTABLE_PA_35_EN,
+ .hw_list = &apulist,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .iova_region = mt8189_multi_dom_apu,
+ .iova_region_nr = ARRAY_SIZE(mt8189_multi_dom_apu),
+ .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}},
+ .iova_region_larb_msk = mt8189_apu_region_msk,
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_infra = {
+ .m4u_plat = M4U_MT8189,
+ .flags = WR_THROT_EN | DCM_DISABLE | MTK_IOMMU_TYPE_INFRA |
+ CFG_IFA_MASTER_IN_ATF | SHARE_PGTABLE | PGTABLE_PA_35_EN,
+ .hw_list = &infralist,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .iova_region = single_domain,
+ .iova_region_nr = ARRAY_SIZE(single_domain),
+};
+
+static const u32 mt8189_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = {
+ [0] = {~0, ~0, ~0, [22] = BIT(0)}, /* Region0: all ports for larb0/1/2 */
+ [1] = {[3] = ~0, [4] = ~0}, /* Region1: all ports for larb4(3)/7(4) */
+ [2] = {[5] = ~0, [6] = ~0, /* Region2: all ports for larb9(5)/11(6) */
+ [7] = ~0, [8] = ~0, /* Region2: all ports for larb13(7)/14(8) */
+ [9] = ~0, [10] = ~0, /* Region2: all ports for larb16(9)/17(10) */
+ [11] = ~0, [12] = ~0, /* Region2: all ports for larb19(11)/20(12) */
+ [21] = ~0}, /* Region2: larb21 fake GCE larb */
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_mm = {
+ .m4u_plat = M4U_MT8189,
+ .flags = HAS_BCLK | HAS_SUB_COMM_3BITS | OUT_ORDER_WR_EN |
+ WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM |
+ PGTABLE_PA_35_EN | DL_WITH_MULTI_LARB,
+ .hw_list = &m4ulist,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 5,
+ .banks_enable = {true, false, false, false, false},
+ .iova_region = mt8192_multi_dom,
+ .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom),
+ .iova_region_larb_msk = mt8189_larb_region_msk,
+ .larbid_remap = {{0}, {1}, {21/* GCE_D */, 21/* GCE_M */, 2},
+ {19, 20, 9, 11}, {7}, {4},
+ {13, 17}, {14, 16}},
+};
+
static const struct mtk_iommu_plat_data mt8192_data = {
.m4u_plat = M4U_MT8192,
.flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN |
@@ -1796,6 +1911,9 @@ static const struct of_device_id mtk_iommu_of_ids[] = {
{ .compatible = "mediatek,mt8188-iommu-infra", .data = &mt8188_data_infra},
{ .compatible = "mediatek,mt8188-iommu-vdo", .data = &mt8188_data_vdo},
{ .compatible = "mediatek,mt8188-iommu-vpp", .data = &mt8188_data_vpp},
+ { .compatible = "mediatek,mt8189-iommu-apu", .data = &mt8189_data_apu},
+ { .compatible = "mediatek,mt8189-iommu-infra", .data = &mt8189_data_infra},
+ { .compatible = "mediatek,mt8189-iommu-mm", .data = &mt8189_data_mm},
{ .compatible = "mediatek,mt8192-m4u", .data = &mt8192_data},
{ .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra},
{ .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo},
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index 10cc0b1197e8..c8d8eff5373d 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -303,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain)
kfree(to_mtk_domain(domain));
}
-static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev)
+static int mtk_iommu_v1_attach_device(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain);
@@ -329,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device
}
static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
@@ -435,6 +438,8 @@ static int mtk_iommu_v1_create_mapping(struct device *dev,
return -EINVAL;
dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+
+ put_device(&m4updev->dev);
}
ret = iommu_fwspec_add_ids(dev, args->args, 1);
@@ -641,13 +646,18 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
if (larb_nr < 0)
return larb_nr;
+ if (larb_nr > MTK_LARB_NR_MAX)
+ return -EINVAL;
+
for (i = 0; i < larb_nr; i++) {
struct device_node *larbnode;
struct platform_device *plarbdev;
larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i);
- if (!larbnode)
- return -EINVAL;
+ if (!larbnode) {
+ ret = -EINVAL;
+ goto out_put_larbs;
+ }
if (!of_device_is_available(larbnode)) {
of_node_put(larbnode);
@@ -657,11 +667,14 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
plarbdev = of_find_device_by_node(larbnode);
if (!plarbdev) {
of_node_put(larbnode);
- return -ENODEV;
+ ret = -ENODEV;
+ goto out_put_larbs;
}
if (!plarbdev->dev.driver) {
of_node_put(larbnode);
- return -EPROBE_DEFER;
+ put_device(&plarbdev->dev);
+ ret = -EPROBE_DEFER;
+ goto out_put_larbs;
}
data->larb_imu[i].dev = &plarbdev->dev;
@@ -673,7 +686,7 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
ret = mtk_iommu_v1_hw_init(data);
if (ret)
- return ret;
+ goto out_put_larbs;
ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL,
dev_name(&pdev->dev));
@@ -695,12 +708,17 @@ out_sysfs_remove:
iommu_device_sysfs_remove(&data->iommu);
out_clk_unprepare:
clk_disable_unprepare(data->bclk);
+out_put_larbs:
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
+
return ret;
}
static void mtk_iommu_v1_remove(struct platform_device *pdev)
{
struct mtk_iommu_v1_data *data = platform_get_drvdata(pdev);
+ int i;
iommu_device_sysfs_remove(&data->iommu);
iommu_device_unregister(&data->iommu);
@@ -708,6 +726,9 @@ static void mtk_iommu_v1_remove(struct platform_device *pdev)
clk_disable_unprepare(data->bclk);
devm_free_irq(&pdev->dev, data->irq, data);
component_master_del(&pdev->dev, &mtk_iommu_v1_com_ops);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
}
static int __maybe_unused mtk_iommu_v1_suspend(struct device *dev)
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 5c6f5943f44b..768973b7e511 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1431,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain)
odomain->iommus = NULL;
}
-static int
-omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int omap_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev, struct iommu_domain *old)
{
struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
@@ -1536,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
}
static int omap_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct omap_iommu_domain *omap_domain;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- omap_domain = to_omap_domain(domain);
+ omap_domain = to_omap_domain(old);
spin_lock(&omap_domain->lock);
_omap_iommu_detach_dev(omap_domain, dev);
spin_unlock(&omap_domain->lock);
@@ -1668,23 +1668,20 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev)
}
pdev = of_find_device_by_node(np);
+ of_node_put(np);
if (!pdev) {
- of_node_put(np);
kfree(arch_data);
return ERR_PTR(-ENODEV);
}
oiommu = platform_get_drvdata(pdev);
+ put_device(&pdev->dev);
if (!oiommu) {
- of_node_put(np);
kfree(arch_data);
return ERR_PTR(-EINVAL);
}
tmp->iommu_dev = oiommu;
- tmp->dev = &pdev->dev;
-
- of_node_put(np);
}
dev_iommu_priv_set(dev, arch_data);
diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h
index 27697109ec79..50b39be61abc 100644
--- a/drivers/iommu/omap-iommu.h
+++ b/drivers/iommu/omap-iommu.h
@@ -88,7 +88,6 @@ struct omap_iommu {
/**
* struct omap_iommu_arch_data - omap iommu private data
* @iommu_dev: handle of the OMAP iommu device
- * @dev: handle of the iommu device
*
* This is an omap iommu private data object, which binds an iommu user
* to its iommu device. This object should be placed at the iommu user's
@@ -97,7 +96,6 @@ struct omap_iommu {
*/
struct omap_iommu_arch_data {
struct omap_iommu *iommu_dev;
- struct device *dev;
};
struct cr_regs {
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index ebb22979075d..d9429097a2b5 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -1321,7 +1321,8 @@ static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_m
}
static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
@@ -1426,7 +1427,8 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
}
static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
@@ -1447,7 +1449,8 @@ static struct iommu_domain riscv_iommu_blocking_domain = {
};
static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 0861dd469bd8..85f3667e797c 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -960,7 +960,8 @@ out_disable_clocks:
}
static int rk_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct rk_iommu *iommu;
struct rk_iommu_domain *rk_domain;
@@ -1005,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = {
};
static int rk_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct rk_iommu *iommu;
struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
@@ -1026,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
if (iommu->domain == domain)
return 0;
- ret = rk_iommu_identity_attach(&rk_identity_domain, dev);
+ ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old);
if (ret)
return ret;
@@ -1041,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
return 0;
ret = rk_iommu_enable(iommu);
- if (ret)
- WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev));
+ if (ret) {
+ /*
+ * Note rk_iommu_identity_attach() might fail before physically
+ * attaching the dev to iommu->domain, in which case the actual
+ * old domain for this revert should be rk_identity_domain rather
+ * than iommu->domain. Since rk_iommu_identity_attach() does not
+ * care about the old domain argument for now, this is not a problem.
+ */
+ WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev,
+ iommu->domain));
+ }
pm_runtime_put(iommu->dev);
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index aa576736d60b..fe679850af28 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -670,7 +670,8 @@ int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status)
}
static int blocking_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
struct s390_domain *s390_domain;
@@ -694,7 +695,8 @@ static int blocking_domain_attach_device(struct iommu_domain *domain,
}
static int s390_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
@@ -709,7 +711,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
domain->geometry.aperture_end < zdev->start_dma))
return -EINVAL;
- blocking_domain_attach_device(&blocking_domain, dev);
+ blocking_domain_attach_device(&blocking_domain, dev, old);
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
@@ -1131,13 +1133,14 @@ static int __init s390_iommu_init(void)
subsys_initcall(s390_iommu_init);
static int s390_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
u8 status;
int cc;
- blocking_domain_attach_device(&blocking_domain, dev);
+ blocking_domain_attach_device(&blocking_domain, dev, old);
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index c7ca1d8a0b15..555d4505c747 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -247,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain)
}
static int sprd_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev);
struct sprd_iommu_domain *dom = to_sprd_domain(domain);
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index de10b569d9a9..90b26fe21817 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -771,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu,
}
static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sun50i_iommu *iommu = dev_iommu_priv_get(dev);
struct sun50i_iommu_domain *sun50i_domain;
@@ -797,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = {
};
static int sun50i_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
struct sun50i_iommu *iommu;
@@ -813,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain,
if (iommu->domain == domain)
return 0;
- sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev);
+ sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old);
sun50i_iommu_attach_domain(iommu, sun50i_domain);
@@ -839,6 +841,8 @@ static int sun50i_iommu_of_xlate(struct device *dev,
dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev));
+ put_device(&iommu_pdev->dev);
+
return iommu_fwspec_add_ids(dev, &id, 1);
}
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 36cdd5fbab07..c391e7f2cde6 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -490,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
}
static int tegra_smmu_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu *smmu = dev_iommu_priv_get(dev);
@@ -524,9 +524,9 @@ disable:
}
static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu_as *as;
struct tegra_smmu *smmu;
@@ -535,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
if (!fwspec)
return -ENODEV;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- as = to_smmu_as(domain);
+ as = to_smmu_as(old);
smmu = as->smmu;
for (index = 0; index < fwspec->num_ids; index++) {
tegra_smmu_disable(smmu, fwspec->ids[index], as->id);
@@ -830,10 +830,9 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np)
return NULL;
mc = platform_get_drvdata(pdev);
- if (!mc) {
- put_device(&pdev->dev);
+ put_device(&pdev->dev);
+ if (!mc)
return NULL;
- }
return mc->smmu;
}
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index b39d6f134ab2..d314fa5cd847 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -730,7 +730,8 @@ static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev)
return domain;
}
-static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
struct virtio_iommu_req_attach req;
@@ -781,7 +782,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
}
static int viommu_attach_identity_domain(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
struct virtio_iommu_req_attach req;
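The iommu hunks above all make the same mechanical change: each attach-type callback gains an "old" argument carrying the previously attached domain, which is what lets the tegra-smmu hunk drop its iommu_get_domain_for_dev() lookup. A minimal sketch of the resulting callback shape, assuming hypothetical my_iommu_* helpers that are not part of the patch:

static int my_iommu_attach_dev(struct iommu_domain *domain,
			       struct device *dev,
			       struct iommu_domain *old)
{
	/* "old" is whatever the core had attached before (NULL on the
	 * first attach), so no iommu_get_domain_for_dev() lookup needed. */
	if (old && old != domain)
		my_iommu_teardown(to_my_domain(old), dev);	/* assumed helper */

	return my_iommu_install(to_my_domain(domain), dev);	/* assumed helper */
}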
diff --git a/drivers/net/ethernet/broadcom/bnge/Makefile b/drivers/net/ethernet/broadcom/bnge/Makefile
index 6142d9c57f49..ea6596854e5c 100644
--- a/drivers/net/ethernet/broadcom/bnge/Makefile
+++ b/drivers/net/ethernet/broadcom/bnge/Makefile
@@ -9,4 +9,5 @@ bng_en-y := bnge_core.o \
bnge_rmem.o \
bnge_resc.o \
bnge_netdev.o \
- bnge_ethtool.o
+ bnge_ethtool.o \
+ bnge_auxr.o
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge.h b/drivers/net/ethernet/broadcom/bnge/bnge.h
index 7aed5f81cd51..411744894349 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge.h
@@ -11,6 +11,7 @@
#include <linux/bnxt/hsi.h>
#include "bnge_rmem.h"
#include "bnge_resc.h"
+#include "bnge_auxr.h"
#define DRV_VER_MAJ 1
#define DRV_VER_MIN 15
@@ -22,6 +23,12 @@ enum board_idx {
BCM57708,
};
+struct bnge_auxr_priv {
+ struct auxiliary_device aux_dev;
+ struct bnge_auxr_dev *auxr_dev;
+ int id;
+};
+
struct bnge_pf_info {
u16 fw_fid;
u16 port_id;
@@ -197,6 +204,9 @@ struct bnge_dev {
struct bnge_irq *irq_tbl;
u16 irqs_acquired;
+
+ struct bnge_auxr_priv *aux_priv;
+ struct bnge_auxr_dev *auxr_dev;
};
static inline bool bnge_is_roce_en(struct bnge_dev *bd)
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c
new file mode 100644
index 000000000000..d64592b64e17
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <asm/byteorder.h>
+#include <linux/bitmap.h>
+#include <linux/auxiliary_bus.h>
+#include <linux/bnxt/hsi.h>
+
+#include "bnge.h"
+#include "bnge_hwrm.h"
+#include "bnge_auxr.h"
+
+static DEFINE_IDA(bnge_aux_dev_ids);
+
+static void bnge_fill_msix_vecs(struct bnge_dev *bd,
+ struct bnge_msix_info *info)
+{
+ struct bnge_auxr_dev *auxr_dev = bd->auxr_dev;
+ int num_msix, i;
+
+ if (!auxr_dev->auxr_info->msix_requested) {
+ dev_warn(bd->dev, "Requested MSI-X vectors not allocated\n");
+ return;
+ }
+ num_msix = auxr_dev->auxr_info->msix_requested;
+ for (i = 0; i < num_msix; i++) {
+ info[i].vector = bd->irq_tbl[i].vector;
+ info[i].db_offset = bd->db_offset;
+ info[i].ring_idx = i;
+ }
+}
+
+int bnge_register_dev(struct bnge_auxr_dev *auxr_dev,
+ void *handle)
+{
+ struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev);
+ struct bnge_auxr_info *auxr_info;
+ int rc = 0;
+
+ netdev_lock(bd->netdev);
+ mutex_lock(&auxr_dev->auxr_dev_lock);
+ if (!bd->irq_tbl) {
+ rc = -ENODEV;
+ goto exit;
+ }
+
+ if (!bnge_aux_has_enough_resources(bd)) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ auxr_info = auxr_dev->auxr_info;
+ auxr_info->handle = handle;
+
+ auxr_info->msix_requested = bd->aux_num_msix;
+
+ bnge_fill_msix_vecs(bd, bd->auxr_dev->msix_info);
+ auxr_dev->flags |= BNGE_ARDEV_MSIX_ALLOC;
+
+exit:
+ mutex_unlock(&auxr_dev->auxr_dev_lock);
+ netdev_unlock(bd->netdev);
+ return rc;
+}
+EXPORT_SYMBOL(bnge_register_dev);
+
+void bnge_unregister_dev(struct bnge_auxr_dev *auxr_dev)
+{
+ struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev);
+ struct bnge_auxr_info *auxr_info;
+
+ auxr_info = auxr_dev->auxr_info;
+ netdev_lock(bd->netdev);
+ mutex_lock(&auxr_dev->auxr_dev_lock);
+ if (auxr_info->msix_requested)
+ auxr_dev->flags &= ~BNGE_ARDEV_MSIX_ALLOC;
+ auxr_info->msix_requested = 0;
+
+ mutex_unlock(&auxr_dev->auxr_dev_lock);
+ netdev_unlock(bd->netdev);
+}
+EXPORT_SYMBOL(bnge_unregister_dev);
+
+int bnge_send_msg(struct bnge_auxr_dev *auxr_dev, struct bnge_fw_msg *fw_msg)
+{
+ struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev);
+ struct output *resp;
+ struct input *req;
+ u32 resp_len;
+ int rc;
+
+ rc = bnge_hwrm_req_init(bd, req, 0 /* don't care */);
+ if (rc)
+ return rc;
+
+ rc = bnge_hwrm_req_replace(bd, req, fw_msg->msg, fw_msg->msg_len);
+ if (rc)
+ goto drop_req;
+
+ bnge_hwrm_req_timeout(bd, req, fw_msg->timeout);
+ resp = bnge_hwrm_req_hold(bd, req);
+ rc = bnge_hwrm_req_send(bd, req);
+ resp_len = le16_to_cpu(resp->resp_len);
+ if (resp_len) {
+ if (fw_msg->resp_max_len < resp_len)
+ resp_len = fw_msg->resp_max_len;
+
+ memcpy(fw_msg->resp, resp, resp_len);
+ }
+drop_req:
+ bnge_hwrm_req_drop(bd, req);
+ return rc;
+}
+EXPORT_SYMBOL(bnge_send_msg);
+
+void bnge_rdma_aux_device_uninit(struct bnge_dev *bd)
+{
+ struct bnge_auxr_priv *aux_priv;
+ struct auxiliary_device *adev;
+
+ /* Skip if no auxiliary device init was done. */
+ if (!bd->aux_priv)
+ return;
+
+ aux_priv = bd->aux_priv;
+ adev = &aux_priv->aux_dev;
+ auxiliary_device_uninit(adev);
+}
+
+static void bnge_aux_dev_release(struct device *dev)
+{
+ struct bnge_auxr_priv *aux_priv =
+ container_of(dev, struct bnge_auxr_priv, aux_dev.dev);
+ struct bnge_dev *bd = pci_get_drvdata(aux_priv->auxr_dev->pdev);
+
+ ida_free(&bnge_aux_dev_ids, aux_priv->id);
+ kfree(aux_priv->auxr_dev->auxr_info);
+ bd->auxr_dev = NULL;
+ kfree(aux_priv->auxr_dev);
+ kfree(aux_priv);
+ bd->aux_priv = NULL;
+}
+
+void bnge_rdma_aux_device_del(struct bnge_dev *bd)
+{
+ if (!bd->auxr_dev)
+ return;
+
+ auxiliary_device_delete(&bd->aux_priv->aux_dev);
+}
+
+static void bnge_set_auxr_dev_info(struct bnge_auxr_dev *auxr_dev,
+ struct bnge_dev *bd)
+{
+ auxr_dev->pdev = bd->pdev;
+ auxr_dev->l2_db_size = bd->db_size;
+ auxr_dev->l2_db_size_nc = bd->db_size;
+ auxr_dev->l2_db_offset = bd->db_offset;
+ mutex_init(&auxr_dev->auxr_dev_lock);
+
+ if (bd->flags & BNGE_EN_ROCE_V1)
+ auxr_dev->flags |= BNGE_ARDEV_ROCEV1_SUPP;
+ if (bd->flags & BNGE_EN_ROCE_V2)
+ auxr_dev->flags |= BNGE_ARDEV_ROCEV2_SUPP;
+
+ auxr_dev->chip_num = bd->chip_num;
+ auxr_dev->hw_ring_stats_size = bd->hw_ring_stats_size;
+ auxr_dev->pf_port_id = bd->pf.port_id;
+ auxr_dev->en_state = bd->state;
+ auxr_dev->bar0 = bd->bar0;
+}
+
+void bnge_rdma_aux_device_add(struct bnge_dev *bd)
+{
+ struct auxiliary_device *aux_dev;
+ int rc;
+
+ if (!bd->auxr_dev)
+ return;
+
+ aux_dev = &bd->aux_priv->aux_dev;
+ rc = auxiliary_device_add(aux_dev);
+ if (rc) {
+ dev_warn(bd->dev, "Failed to add auxiliary device for ROCE\n");
+ auxiliary_device_uninit(aux_dev);
+ bd->flags &= ~BNGE_EN_ROCE;
+ }
+
+ bd->auxr_dev->net = bd->netdev;
+}
+
+void bnge_rdma_aux_device_init(struct bnge_dev *bd)
+{
+ struct auxiliary_device *aux_dev;
+ struct bnge_auxr_info *auxr_info;
+ struct bnge_auxr_priv *aux_priv;
+ struct bnge_auxr_dev *auxr_dev;
+ int rc;
+
+ if (!bnge_is_roce_en(bd))
+ return;
+
+ aux_priv = kzalloc(sizeof(*aux_priv), GFP_KERNEL);
+ if (!aux_priv)
+ goto exit;
+
+ aux_priv->id = ida_alloc(&bnge_aux_dev_ids, GFP_KERNEL);
+ if (aux_priv->id < 0) {
+ dev_warn(bd->dev, "ida alloc failed for aux device\n");
+ kfree(aux_priv);
+ goto exit;
+ }
+
+ aux_dev = &aux_priv->aux_dev;
+ aux_dev->id = aux_priv->id;
+ aux_dev->name = "rdma";
+ aux_dev->dev.parent = &bd->pdev->dev;
+ aux_dev->dev.release = bnge_aux_dev_release;
+
+ rc = auxiliary_device_init(aux_dev);
+ if (rc) {
+ ida_free(&bnge_aux_dev_ids, aux_priv->id);
+ kfree(aux_priv);
+ goto exit;
+ }
+ bd->aux_priv = aux_priv;
+
+ auxr_dev = kzalloc(sizeof(*auxr_dev), GFP_KERNEL);
+ if (!auxr_dev)
+ goto aux_dev_uninit;
+
+ aux_priv->auxr_dev = auxr_dev;
+
+ auxr_info = kzalloc(sizeof(*auxr_info), GFP_KERNEL);
+ if (!auxr_info)
+ goto aux_dev_uninit;
+
+ auxr_dev->auxr_info = auxr_info;
+ bd->auxr_dev = auxr_dev;
+ bnge_set_auxr_dev_info(auxr_dev, bd);
+
+ return;
+
+aux_dev_uninit:
+ auxiliary_device_uninit(aux_dev);
+exit:
+ bd->flags &= ~BNGE_EN_ROCE;
+}
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h
new file mode 100644
index 000000000000..6c5c15ef2b0a
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Broadcom */
+
+#ifndef _BNGE_AUXR_H_
+#define _BNGE_AUXR_H_
+
+#include <linux/auxiliary_bus.h>
+
+#define BNGE_MIN_ROCE_CP_RINGS 2
+#define BNGE_MIN_ROCE_STAT_CTXS 1
+
+#define BNGE_MAX_ROCE_MSIX 64
+
+struct hwrm_async_event_cmpl;
+struct bnge_dev;
+
+struct bnge_msix_info {
+ u32 vector;
+ u32 ring_idx;
+ u32 db_offset;
+};
+
+struct bnge_fw_msg {
+ void *msg;
+ int msg_len;
+ void *resp;
+ int resp_max_len;
+ int timeout;
+};
+
+struct bnge_auxr_info {
+ void *handle;
+ u16 msix_requested;
+};
+
+enum {
+ BNGE_ARDEV_ROCEV1_SUPP = BIT(0),
+ BNGE_ARDEV_ROCEV2_SUPP = BIT(1),
+ BNGE_ARDEV_MSIX_ALLOC = BIT(2),
+};
+
+#define BNGE_ARDEV_ROCE_SUPP (BNGE_ARDEV_ROCEV1_SUPP | \
+ BNGE_ARDEV_ROCEV2_SUPP)
+
+struct bnge_auxr_dev {
+ struct net_device *net;
+ struct pci_dev *pdev;
+ void __iomem *bar0;
+
+ struct bnge_msix_info msix_info[BNGE_MAX_ROCE_MSIX];
+
+ u32 flags;
+
+ struct bnge_auxr_info *auxr_info;
+
+ /* Doorbell BAR size in bytes mapped by L2 driver. */
+ int l2_db_size;
+ /* Doorbell BAR size in bytes mapped as non-cacheable. */
+ int l2_db_size_nc;
+ /* Doorbell offset in bytes within l2_db_size_nc. */
+ int l2_db_offset;
+
+ u16 chip_num;
+ u16 hw_ring_stats_size;
+ u16 pf_port_id;
+ unsigned long en_state;
+
+ u16 auxr_num_msix_vec;
+ u16 auxr_num_ctxs;
+
+ /* serialize auxr operations */
+ struct mutex auxr_dev_lock;
+};
+
+void bnge_rdma_aux_device_uninit(struct bnge_dev *bdev);
+void bnge_rdma_aux_device_del(struct bnge_dev *bdev);
+void bnge_rdma_aux_device_add(struct bnge_dev *bdev);
+void bnge_rdma_aux_device_init(struct bnge_dev *bdev);
+int bnge_register_dev(struct bnge_auxr_dev *adev,
+ void *handle);
+void bnge_unregister_dev(struct bnge_auxr_dev *adev);
+int bnge_send_msg(struct bnge_auxr_dev *adev, struct bnge_fw_msg *fw_msg);
+
+#endif /* _BNGE_AUXR_H_ */
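For orientation, a sketch of how an RDMA driver might bind to the "rdma" auxiliary device that bnge_rdma_aux_device_init() creates above. All my_rdma_* names are illustrative, and the "bng_en.rdma" match string assumes the usual modname.device-name pairing on the auxiliary bus:

#include <linux/auxiliary_bus.h>
#include "bnge.h"	/* struct bnge_auxr_priv, bnge_register_dev() */

static int my_rdma_probe(struct auxiliary_device *adev,
			 const struct auxiliary_device_id *id)
{
	struct bnge_auxr_priv *priv =
		container_of(adev, struct bnge_auxr_priv, aux_dev);

	/* Claim MSI-X and stat contexts from the L2 driver before use. */
	return bnge_register_dev(priv->auxr_dev, priv);
}

static void my_rdma_remove(struct auxiliary_device *adev)
{
	struct bnge_auxr_priv *priv =
		container_of(adev, struct bnge_auxr_priv, aux_dev);

	bnge_unregister_dev(priv->auxr_dev);
}

static const struct auxiliary_device_id my_rdma_id_table[] = {
	{ .name = "bng_en.rdma" },	/* assumed modname.name match */
	{}
};

static struct auxiliary_driver my_rdma_driver = {
	.probe = my_rdma_probe,
	.remove = my_rdma_remove,
	.id_table = my_rdma_id_table,
};
module_auxiliary_driver(my_rdma_driver);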
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_core.c b/drivers/net/ethernet/broadcom/bnge/bnge_core.c
index 2c72dd34d50d..c94e132bebc8 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_core.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_core.c
@@ -41,6 +41,11 @@ static void bnge_print_device_info(struct pci_dev *pdev, enum board_idx idx)
bool bnge_aux_registered(struct bnge_dev *bd)
{
+ struct bnge_auxr_dev *ba_dev = bd->auxr_dev;
+
+ if (ba_dev && ba_dev->auxr_info->msix_requested)
+ return true;
+
return false;
}
@@ -312,16 +317,20 @@ static int bnge_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent)
spin_lock_init(&bd->db_lock);
#endif
+ bnge_rdma_aux_device_init(bd);
+
rc = bnge_alloc_irqs(bd);
if (rc) {
dev_err(&pdev->dev, "Error IRQ allocation rc = %d\n", rc);
- goto err_config_uninit;
+ goto err_uninit_auxr;
}
rc = bnge_netdev_alloc(bd, max_irqs);
if (rc)
goto err_free_irq;
+ bnge_rdma_aux_device_add(bd);
+
pci_save_state(pdev);
return 0;
@@ -329,6 +338,9 @@ static int bnge_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent)
err_free_irq:
bnge_free_irqs(bd);
+err_uninit_auxr:
+ bnge_rdma_aux_device_uninit(bd);
+
err_config_uninit:
bnge_net_uninit_dflt_config(bd);
@@ -354,10 +366,14 @@ static void bnge_remove_one(struct pci_dev *pdev)
{
struct bnge_dev *bd = pci_get_drvdata(pdev);
+ bnge_rdma_aux_device_del(bd);
+
bnge_netdev_free(bd);
bnge_free_irqs(bd);
+ bnge_rdma_aux_device_uninit(bd);
+
bnge_net_uninit_dflt_config(bd);
bnge_devlink_unregister(bd);
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c
index 0f971af24142..c3087e5cd875 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c
@@ -98,6 +98,46 @@ void bnge_hwrm_req_alloc_flags(struct bnge_dev *bd, void *req, gfp_t gfp)
ctx->gfp = gfp;
}
+int bnge_hwrm_req_replace(struct bnge_dev *bd, void *req, void *new_req,
+ u32 len)
+{
+ struct bnge_hwrm_ctx *ctx = __hwrm_ctx_get(bd, req);
+ struct input *internal_req = req;
+ u16 req_type;
+
+ if (!ctx)
+ return -EINVAL;
+
+ if (len > BNGE_HWRM_CTX_OFFSET)
+ return -E2BIG;
+
+ /* free any existing slices */
+ ctx->allocated = BNGE_HWRM_DMA_SIZE - BNGE_HWRM_CTX_OFFSET;
+ if (ctx->slice_addr) {
+ dma_free_coherent(bd->dev, ctx->slice_size,
+ ctx->slice_addr, ctx->slice_handle);
+ ctx->slice_addr = NULL;
+ }
+ ctx->gfp = GFP_KERNEL;
+
+ if ((bd->fw_cap & BNGE_FW_CAP_SHORT_CMD) || len > BNGE_HWRM_MAX_REQ_LEN) {
+ memcpy(internal_req, new_req, len);
+ } else {
+ internal_req->req_type = ((struct input *)new_req)->req_type;
+ ctx->req = new_req;
+ }
+
+ ctx->req_len = len;
+ ctx->req->resp_addr = cpu_to_le64(ctx->dma_handle +
+ BNGE_HWRM_RESP_OFFSET);
+
+ /* update sentinel for potentially new request type */
+ req_type = le16_to_cpu(internal_req->req_type);
+ ctx->sentinel = bnge_cal_sentinel(ctx, req_type);
+
+ return 0;
+}
+
void bnge_hwrm_req_flags(struct bnge_dev *bd, void *req,
enum bnge_hwrm_ctx_flags flags)
{
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
index 83794a12cc81..6df629761d95 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
@@ -107,4 +107,6 @@ int bnge_hwrm_req_send_silent(struct bnge_dev *bd, void *req);
void bnge_hwrm_req_alloc_flags(struct bnge_dev *bd, void *req, gfp_t flags);
void *bnge_hwrm_req_dma_slice(struct bnge_dev *bd, void *req, u32 size,
dma_addr_t *dma);
+int bnge_hwrm_req_replace(struct bnge_dev *bd, void *req, void *new_req,
+ u32 len);
#endif /* _BNGE_HWRM_H_ */
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_resc.c b/drivers/net/ethernet/broadcom/bnge/bnge_resc.c
index 62ebe03a0dcf..943df5f60f01 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_resc.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_resc.c
@@ -34,6 +34,18 @@ static unsigned int bnge_get_max_func_stat_ctxs(struct bnge_dev *bd)
return bd->hw_resc.max_stat_ctxs;
}
+bool bnge_aux_has_enough_resources(struct bnge_dev *bd)
+{
+ unsigned int max_stat_ctxs;
+
+ max_stat_ctxs = bnge_get_max_func_stat_ctxs(bd);
+ if (max_stat_ctxs <= BNGE_MIN_ROCE_STAT_CTXS ||
+ bd->nq_nr_rings == max_stat_ctxs)
+ return false;
+
+ return true;
+}
+
static unsigned int bnge_get_max_func_cp_rings(struct bnge_dev *bd)
{
return bd->hw_resc.max_cp_rings;
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_resc.h b/drivers/net/ethernet/broadcom/bnge/bnge_resc.h
index 0d6213b27580..b62a634669f6 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_resc.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_resc.h
@@ -74,6 +74,7 @@ void bnge_net_uninit_dflt_config(struct bnge_dev *bd);
void bnge_aux_init_dflt_config(struct bnge_dev *bd);
u32 bnge_get_rxfh_indir_size(struct bnge_dev *bd);
int bnge_cal_nr_rss_ctxs(u16 rx_rings);
+bool bnge_aux_has_enough_resources(struct bnge_dev *bd);
static inline u32
bnge_adjust_pow_two(u32 total_ent, u16 ent_per_blk)
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index cb1011f6fd30..805daae9dd36 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6444,7 +6444,6 @@ bnx2_reset_task(struct work_struct *work)
if (!(pcicmd & PCI_COMMAND_MEMORY)) {
/* in case PCI block has reset */
pci_restore_state(bp->pdev);
- pci_save_state(bp->pdev);
}
rc = bnx2_init_nic(bp, 1);
if (rc) {
@@ -8718,7 +8717,6 @@ static pci_ers_result_t bnx2_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (netif_running(dev))
err = bnx2_init_nic(bp, 1);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index aca4267babc8..6a1cc2032bf3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -14216,7 +14216,6 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (netif_running(dev))
bnx2x_set_power_state(bp, PCI_D0);
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index e21f7c6a6de7..75f66587983d 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -18337,7 +18337,6 @@ static pci_ers_result_t tg3_io_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (!netdev || !netif_running(netdev)) {
rc = PCI_ERS_RESULT_RECOVERED;
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
index f92a3550e480..3b1321c8ed14 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -2933,7 +2933,6 @@ static int t3_reenable_adapter(struct adapter *adapter)
}
pci_set_master(adapter->pdev);
pci_restore_state(adapter->pdev);
- pci_save_state(adapter->pdev);
/* Free sge resources */
t3_free_sge_resources(adapter);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 66b8854e059f..043733c5c812 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5458,7 +5458,6 @@ static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev)
if (!adap) {
pci_restore_state(pdev);
- pci_save_state(pdev);
return PCI_ERS_RESULT_RECOVERED;
}
@@ -5473,7 +5472,6 @@ static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (t4_wait_dev_ready(adap->regs) < 0)
return PCI_ERS_RESULT_DISCONNECT;
diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
index e11495b7ee98..7234618e8e81 100644
--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
@@ -160,7 +160,6 @@ static pci_ers_result_t hbg_pci_err_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
hbg_err_reset(priv);
return PCI_ERS_RESULT_RECOVERED;
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 116f3c92b5bc..ddbe2f7d8112 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -7195,7 +7195,6 @@ static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev)
"Cannot re-enable PCI device after reset.\n");
result = PCI_ERS_RESULT_DISCONNECT;
} else {
- pdev->state_saved = true;
pci_restore_state(pdev);
pci_set_master(pdev);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index ae5fe34659cf..d75b8a50413d 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -2423,12 +2423,6 @@ static pci_ers_result_t fm10k_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
-
- /* After second error pci->state_saved is false, this
- * resets it so EEH doesn't break.
- */
- pci_save_state(pdev);
-
pci_wake_from_d3(pdev, false);
result = PCI_ERS_RESULT_RECOVERED;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 50be0a60ae13..d8192aa23254 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -16455,7 +16455,6 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
reg = rd32(&pf->hw, I40E_GLGEN_RTRIG);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 2533876f1a2f..4bb68e7a00f5 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -5653,7 +5653,6 @@ static int ice_resume(struct device *dev)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (!pci_device_is_present(pdev))
return -ENODEV;
@@ -5753,7 +5752,6 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
/* Check for life */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 85f9589cc568..dbea37269d2c 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -9599,7 +9599,6 @@ static int __igb_resume(struct device *dev, bool rpm)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (!pci_device_is_present(pdev))
return -ENODEV;
@@ -9754,7 +9753,6 @@ static pci_ers_result_t igb_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_enable_wake(pdev, PCI_D3hot, 0);
pci_enable_wake(pdev, PCI_D3cold, 0);
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 728d7ca5338b..7aafa60ba0c8 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -7530,7 +7530,6 @@ static int __igc_resume(struct device *dev, bool rpm)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (!pci_device_is_present(pdev))
return -ENODEV;
@@ -7667,7 +7666,6 @@ static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_enable_wake(pdev, PCI_D3hot, 0);
pci_enable_wake(pdev, PCI_D3cold, 0);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 4af3b3e71ff1..034618e79169 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -12298,7 +12298,6 @@ static pci_ers_result_t ixgbe_io_slot_reset(struct pci_dev *pdev)
adapter->hw.hw_addr = adapter->io_addr;
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 2de226951e19..4293f8e33f44 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -4368,7 +4368,6 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
return PCI_ERS_RESULT_RECOVERED;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 024339ce41f1..1ab569ce3fcf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -2137,7 +2137,6 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
err = wait_vital(pdev);
if (err) {
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
index 861d98099c44..9240673c7533 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
@@ -581,7 +581,6 @@ static pci_ers_result_t fbnic_err_slot_reset(struct pci_dev *pdev)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (pci_enable_device_mem(pdev)) {
dev_err(&pdev->dev,
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index 9d70b51ca91d..e4c542fc6c2b 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -3915,7 +3915,6 @@ static int lan743x_pm_resume(struct device *dev)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
/* Restore HW_CFG that was saved during pm suspend */
if (adapter->is_pci11x1x)
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index e611ff7fa3fa..7be30a8df268 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -3416,10 +3416,6 @@ static void myri10ge_watchdog(struct work_struct *work)
* nic was resumed from power saving mode.
*/
pci_restore_state(mgp->pdev);
-
- /* save state again for accounting reasons */
- pci_save_state(mgp->pdev);
-
} else {
/* if we get back -1's from our slot, perhaps somebody
* powered off our card. Don't try to reset it in
diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
index 5026b0263d43..1e55ccb4822b 100644
--- a/drivers/net/ethernet/neterion/s2io.c
+++ b/drivers/net/ethernet/neterion/s2io.c
@@ -3425,7 +3425,6 @@ static void s2io_reset(struct s2io_nic *sp)
/* Restore the PCI state saved during initialization. */
pci_restore_state(sp->pdev);
- pci_save_state(sp->pdev);
pci_read_config_word(sp->pdev, 0x2, &val16);
if (check_pci_device_id(val16) != (u16)PCI_ANY_ID)
break;
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 67647f1880fb..f3c81c892786 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -4,7 +4,7 @@
obj-$(CONFIG_PCI) += access.o bus.o probe.o host-bridge.o \
remove.o pci.o pci-driver.o search.o \
- rom.o setup-res.o irq.o vpd.o \
+ rebar.o rom.o setup-res.o irq.o vpd.o \
setup-bus.o vc.o mmap.o devres.o
obj-$(CONFIG_PCI) += msi/
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index f26aec6ff588..9daf13ed3714 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -357,6 +357,9 @@ void pci_bus_add_device(struct pci_dev *dev)
pci_proc_attach_device(dev);
pci_bridge_d3_update(dev);
+ /* Save config space for later error recovery */
+ pci_save_state(dev);
+
/*
* If the PCI device is associated with a pwrctrl device with a
* power supply, create a device link between the PCI device and
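The networking hunks above all delete a pci_save_state() (or, in e1000e's case, a state_saved fixup) that followed pci_restore_state(): with the snapshot now taken here when the device is added to the bus, re-saving in each driver to keep later recoveries working is redundant. A sketch of the slimmed-down slot_reset pattern those drivers converge on; my_io_slot_reset is a placeholder name:

static pci_ers_result_t my_io_slot_reset(struct pci_dev *pdev)
{
	if (pci_enable_device_mem(pdev))
		return PCI_ERS_RESULT_DISCONNECT;

	pci_set_master(pdev);
	/* Restores the snapshot taken in pci_bus_add_device(); no
	 * follow-up pci_save_state() is needed in the driver. */
	pci_restore_state(pdev);

	return PCI_ERS_RESULT_RECOVERED;
}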
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index 41748d083b93..c254d2b8bf17 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -146,7 +146,7 @@ config PCIE_HISI_ERR
config PCI_IXP4XX
bool "Intel IXP4xx PCI controller"
- depends on ARM && OF
+ depends on OF
depends on ARCH_IXP4XX || COMPILE_TEST
default ARCH_IXP4XX
help
@@ -259,12 +259,20 @@ config PCIE_RCAR_EP
config PCI_RCAR_GEN2
bool "Renesas R-Car Gen2 Internal PCI controller"
- depends on ARCH_RENESAS || COMPILE_TEST
- depends on ARM
+ depends on (ARCH_RENESAS && ARM) || COMPILE_TEST
help
Say Y here if you want internal PCI support on R-Car Gen2 SoC.
- There are 3 internal PCI controllers available with a single
- built-in EHCI/OHCI host controller present on each one.
+ Each internal PCI controller contains a single built-in EHCI/OHCI
+ host controller.
+
+config PCIE_RENESAS_RZG3S_HOST
+ bool "Renesas RZ/G3S PCIe host controller"
+ depends on ARCH_RENESAS || COMPILE_TEST
+ select MFD_SYSCON
+ select IRQ_MSI_LIB
+ help
+ Say Y here if you want PCIe host controller support on Renesas RZ/G3S
+ SoC.
config PCIE_ROCKCHIP
bool
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index 038ccbd9e3ba..229929a945c2 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o
obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar.o pcie-rcar-host.o
obj-$(CONFIG_PCIE_RCAR_EP) += pcie-rcar.o pcie-rcar-ep.o
+obj-$(CONFIG_PCIE_RENESAS_RZG3S_HOST) += pcie-rzg3s-host.o
obj-$(CONFIG_PCI_HOST_COMMON) += pci-host-common.o
obj-$(CONFIG_PCI_HOST_GENERIC) += pci-host-generic.o
obj-$(CONFIG_PCI_HOST_THUNDER_ECAM) += pci-thunder-ecam.o
diff --git a/drivers/pci/controller/cadence/Kconfig b/drivers/pci/controller/cadence/Kconfig
index 02a639e55fd8..9e651d545973 100644
--- a/drivers/pci/controller/cadence/Kconfig
+++ b/drivers/pci/controller/cadence/Kconfig
@@ -19,10 +19,10 @@ config PCIE_CADENCE_EP
select PCIE_CADENCE
config PCIE_CADENCE_PLAT
- bool
+ tristate
config PCIE_CADENCE_PLAT_HOST
- bool "Cadence platform PCIe controller (host mode)"
+ tristate "Cadence platform PCIe controller (host mode)"
depends on OF
select PCIE_CADENCE_HOST
select PCIE_CADENCE_PLAT
@@ -32,7 +32,7 @@ config PCIE_CADENCE_PLAT_HOST
vendors SoCs.
config PCIE_CADENCE_PLAT_EP
- bool "Cadence platform PCIe controller (endpoint mode)"
+ tristate "Cadence platform PCIe controller (endpoint mode)"
depends on OF
depends on PCI_ENDPOINT
select PCIE_CADENCE_EP
@@ -42,6 +42,21 @@ config PCIE_CADENCE_PLAT_EP
endpoint mode. This PCIe controller may be embedded into many
different vendors SoCs.
+config PCI_SKY1_HOST
+ tristate "CIX SKY1 PCIe controller (host mode)"
+ depends on OF && (ARCH_CIX || COMPILE_TEST)
+ select PCIE_CADENCE_HOST
+ select PCI_ECAM
+ help
+ Say Y here if you want to support the CIX SKY1 PCIe platform
+ controller in host mode. The CIX SKY1 PCIe controller uses the
+ Cadence HPA (High Performance Architecture) IP, the second
+ generation of Cadence PCIe IP.
+
+ This driver requires the Cadence PCIe core infrastructure
+ (PCIE_CADENCE_HOST) and a hardware platform adaptation layer
+ to function.
+
config PCIE_SG2042_HOST
tristate "Sophgo SG2042 PCIe controller (host mode)"
depends on OF && (ARCH_SOPHGO || COMPILE_TEST)
diff --git a/drivers/pci/controller/cadence/Makefile b/drivers/pci/controller/cadence/Makefile
index 5e23f8539ecc..b8ec1cecfaa8 100644
--- a/drivers/pci/controller/cadence/Makefile
+++ b/drivers/pci/controller/cadence/Makefile
@@ -1,7 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_PCIE_CADENCE) += pcie-cadence.o
-obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
-obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
+pcie-cadence-mod-y := pcie-cadence-hpa.o pcie-cadence.o
+pcie-cadence-host-mod-y := pcie-cadence-host-common.o pcie-cadence-host.o pcie-cadence-host-hpa.o
+pcie-cadence-ep-mod-y := pcie-cadence-ep.o
+
+obj-$(CONFIG_PCIE_CADENCE) += pcie-cadence-mod.o
+obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host-mod.o
+obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep-mod.o
obj-$(CONFIG_PCIE_CADENCE_PLAT) += pcie-cadence-plat.o
obj-$(CONFIG_PCI_J721E) += pci-j721e.o
obj-$(CONFIG_PCIE_SG2042_HOST) += pcie-sg2042.o
+obj-$(CONFIG_PCI_SKY1_HOST) += pci-sky1.o
diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c
index 5bc5ab20aa6d..ecd1b0312400 100644
--- a/drivers/pci/controller/cadence/pci-j721e.c
+++ b/drivers/pci/controller/cadence/pci-j721e.c
@@ -477,9 +477,7 @@ static int j721e_pcie_probe(struct platform_device *pdev)
struct j721e_pcie *pcie;
struct cdns_pcie_rc *rc = NULL;
struct cdns_pcie_ep *ep = NULL;
- struct gpio_desc *gpiod;
void __iomem *base;
- struct clk *clk;
u32 num_lanes;
u32 mode;
int ret;
@@ -590,12 +588,12 @@ static int j721e_pcie_probe(struct platform_device *pdev)
switch (mode) {
case PCI_MODE_RC:
- gpiod = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
- if (IS_ERR(gpiod)) {
- ret = dev_err_probe(dev, PTR_ERR(gpiod), "Failed to get reset GPIO\n");
+ pcie->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
+ if (IS_ERR(pcie->reset_gpio)) {
+ ret = dev_err_probe(dev, PTR_ERR(pcie->reset_gpio),
+ "Failed to get reset GPIO\n");
goto err_get_sync;
}
- pcie->reset_gpio = gpiod;
ret = cdns_pcie_init_phy(dev, cdns_pcie);
if (ret) {
@@ -603,19 +601,13 @@ static int j721e_pcie_probe(struct platform_device *pdev)
goto err_get_sync;
}
- clk = devm_clk_get_optional(dev, "pcie_refclk");
- if (IS_ERR(clk)) {
- ret = dev_err_probe(dev, PTR_ERR(clk), "failed to get pcie_refclk\n");
+ pcie->refclk = devm_clk_get_optional_enabled(dev, "pcie_refclk");
+ if (IS_ERR(pcie->refclk)) {
+ ret = dev_err_probe(dev, PTR_ERR(pcie->refclk),
+ "failed to enable pcie_refclk\n");
goto err_pcie_setup;
}
- ret = clk_prepare_enable(clk);
- if (ret) {
- dev_err_probe(dev, ret, "failed to enable pcie_refclk\n");
- goto err_pcie_setup;
- }
- pcie->refclk = clk;
-
/*
* Section 2.2 of the PCI Express Card Electromechanical
* Specification (Revision 5.1) mandates that the deassertion
@@ -623,16 +615,14 @@ static int j721e_pcie_probe(struct platform_device *pdev)
* This shall ensure that the power and the reference clock
* are stable.
*/
- if (gpiod) {
+ if (pcie->reset_gpio) {
msleep(PCIE_T_PVPERL_MS);
- gpiod_set_value_cansleep(gpiod, 1);
+ gpiod_set_value_cansleep(pcie->reset_gpio, 1);
}
ret = cdns_pcie_host_setup(rc);
- if (ret < 0) {
- clk_disable_unprepare(pcie->refclk);
+ if (ret < 0)
goto err_pcie_setup;
- }
break;
case PCI_MODE_EP:
@@ -679,7 +669,6 @@ static void j721e_pcie_remove(struct platform_device *pdev)
gpiod_set_value_cansleep(pcie->reset_gpio, 0);
- clk_disable_unprepare(pcie->refclk);
cdns_pcie_disable_phy(cdns_pcie);
j721e_pcie_disable_link_irq(pcie);
pm_runtime_put(dev);
diff --git a/drivers/pci/controller/cadence/pci-sky1.c b/drivers/pci/controller/cadence/pci-sky1.c
new file mode 100644
index 000000000000..d8c216dc120d
--- /dev/null
+++ b/drivers/pci/controller/cadence/pci-sky1.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCIe controller driver for CIX's sky1 SoCs
+ *
+ * Copyright 2025 Cix Technology Group Co., Ltd.
+ * Author: Hans Zhang <hans.zhang@cixtech.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/pci.h>
+#include <linux/pci-ecam.h>
+#include <linux/pci_ids.h>
+
+#include "pcie-cadence.h"
+#include "pcie-cadence-host-common.h"
+
+#define PCI_VENDOR_ID_CIX 0x1f6c
+#define PCI_DEVICE_ID_CIX_SKY1 0x0001
+
+#define STRAP_REG(n) ((n) * 0x04)
+#define STATUS_REG(n) ((n) * 0x04)
+#define LINK_TRAINING_ENABLE BIT(0)
+#define LINK_COMPLETE BIT(0)
+
+#define SKY1_IP_REG_BANK 0x1000
+#define SKY1_IP_CFG_CTRL_REG_BANK 0x4c00
+#define SKY1_IP_AXI_MASTER_COMMON 0xf000
+#define SKY1_AXI_SLAVE 0x9000
+#define SKY1_AXI_MASTER 0xb000
+#define SKY1_AXI_HLS_REGISTERS 0xc000
+#define SKY1_AXI_RAS_REGISTERS 0xe000
+#define SKY1_DTI_REGISTERS 0xd000
+
+#define IP_REG_I_DBG_STS_0 0x420
+
+struct sky1_pcie {
+ struct cdns_pcie *cdns_pcie;
+ struct cdns_pcie_rc *cdns_pcie_rc;
+
+ struct resource *cfg_res;
+ struct resource *msg_res;
+ struct pci_config_window *cfg;
+ void __iomem *strap_base;
+ void __iomem *status_base;
+ void __iomem *reg_base;
+ void __iomem *cfg_base;
+ void __iomem *msg_base;
+};
+
+static int sky1_pcie_resource_get(struct platform_device *pdev,
+ struct sky1_pcie *pcie)
+{
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+ void __iomem *base;
+
+ base = devm_platform_ioremap_resource_byname(pdev, "reg");
+ if (IS_ERR(base))
+ return dev_err_probe(dev, PTR_ERR(base),
+ "unable to find \"reg\" registers\n");
+ pcie->reg_base = base;
+
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
+ if (!res)
+ return dev_err_probe(dev, -ENODEV, "unable to get \"cfg\" resource\n");
+ pcie->cfg_res = res;
+
+ base = devm_platform_ioremap_resource_byname(pdev, "rcsu_strap");
+ if (IS_ERR(base))
+ return dev_err_probe(dev, PTR_ERR(base),
+ "unable to find \"rcsu_strap\" registers\n");
+ pcie->strap_base = base;
+
+ base = devm_platform_ioremap_resource_byname(pdev, "rcsu_status");
+ if (IS_ERR(base))
+ return dev_err_probe(dev, PTR_ERR(base),
+ "unable to find \"rcsu_status\" registers\n");
+ pcie->status_base = base;
+
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "msg");
+ if (!res)
+ return dev_err_probe(dev, -ENODEV, "unable to get \"msg\" resource\n");
+ pcie->msg_res = res;
+ pcie->msg_base = devm_ioremap_resource(dev, res);
+ if (IS_ERR(pcie->msg_base)) {
+ return dev_err_probe(dev, PTR_ERR(pcie->msg_base),
+ "unable to ioremap msg resource\n");
+ }
+
+ return 0;
+}
+
+static int sky1_pcie_start_link(struct cdns_pcie *cdns_pcie)
+{
+ struct sky1_pcie *pcie = dev_get_drvdata(cdns_pcie->dev);
+ u32 val;
+
+ val = readl(pcie->strap_base + STRAP_REG(1));
+ val |= LINK_TRAINING_ENABLE;
+ writel(val, pcie->strap_base + STRAP_REG(1));
+
+ return 0;
+}
+
+static void sky1_pcie_stop_link(struct cdns_pcie *cdns_pcie)
+{
+ struct sky1_pcie *pcie = dev_get_drvdata(cdns_pcie->dev);
+ u32 val;
+
+ val = readl(pcie->strap_base + STRAP_REG(1));
+ val &= ~LINK_TRAINING_ENABLE;
+ writel(val, pcie->strap_base + STRAP_REG(1));
+}
+
+static bool sky1_pcie_link_up(struct cdns_pcie *cdns_pcie)
+{
+ u32 val;
+
+ val = cdns_pcie_hpa_readl(cdns_pcie, REG_BANK_IP_REG,
+ IP_REG_I_DBG_STS_0);
+ return val & LINK_COMPLETE;
+}
+
+static const struct cdns_pcie_ops sky1_pcie_ops = {
+ .start_link = sky1_pcie_start_link,
+ .stop_link = sky1_pcie_stop_link,
+ .link_up = sky1_pcie_link_up,
+};
+
+static int sky1_pcie_probe(struct platform_device *pdev)
+{
+ struct cdns_plat_pcie_of_data *reg_off;
+ struct device *dev = &pdev->dev;
+ struct pci_host_bridge *bridge;
+ struct cdns_pcie *cdns_pcie;
+ struct resource_entry *bus;
+ struct cdns_pcie_rc *rc;
+ struct sky1_pcie *pcie;
+ int ret;
+
+ pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
+ if (!pcie)
+ return -ENOMEM;
+
+ bridge = devm_pci_alloc_host_bridge(dev, sizeof(*rc));
+ if (!bridge)
+ return -ENOMEM;
+
+ ret = sky1_pcie_resource_get(pdev, pcie);
+ if (ret < 0)
+ return ret;
+
+ bus = resource_list_first_type(&bridge->windows, IORESOURCE_BUS);
+ if (!bus)
+ return -ENODEV;
+
+ pcie->cfg = pci_ecam_create(dev, pcie->cfg_res, bus->res,
+ &pci_generic_ecam_ops);
+ if (IS_ERR(pcie->cfg))
+ return PTR_ERR(pcie->cfg);
+
+ bridge->ops = (struct pci_ops *)&pci_generic_ecam_ops.pci_ops;
+ rc = pci_host_bridge_priv(bridge);
+ rc->ecam_supported = 1;
+ rc->cfg_base = pcie->cfg->win;
+ rc->cfg_res = &pcie->cfg->res;
+
+ cdns_pcie = &rc->pcie;
+ cdns_pcie->dev = dev;
+ cdns_pcie->ops = &sky1_pcie_ops;
+ cdns_pcie->reg_base = pcie->reg_base;
+ cdns_pcie->msg_res = pcie->msg_res;
+ cdns_pcie->is_rc = 1;
+
+ reg_off = devm_kzalloc(dev, sizeof(*reg_off), GFP_KERNEL);
+ if (!reg_off)
+ return -ENOMEM;
+
+ reg_off->ip_reg_bank_offset = SKY1_IP_REG_BANK;
+ reg_off->ip_cfg_ctrl_reg_offset = SKY1_IP_CFG_CTRL_REG_BANK;
+ reg_off->axi_mstr_common_offset = SKY1_IP_AXI_MASTER_COMMON;
+ reg_off->axi_slave_offset = SKY1_AXI_SLAVE;
+ reg_off->axi_master_offset = SKY1_AXI_MASTER;
+ reg_off->axi_hls_offset = SKY1_AXI_HLS_REGISTERS;
+ reg_off->axi_ras_offset = SKY1_AXI_RAS_REGISTERS;
+ reg_off->axi_dti_offset = SKY1_DTI_REGISTERS;
+ cdns_pcie->cdns_pcie_reg_offsets = reg_off;
+
+ pcie->cdns_pcie = cdns_pcie;
+ pcie->cdns_pcie_rc = rc;
+ pcie->cfg_base = rc->cfg_base;
+ bridge->sysdata = pcie->cfg;
+
+ rc->vendor_id = PCI_VENDOR_ID_CIX;
+ rc->device_id = PCI_DEVICE_ID_CIX_SKY1;
+ rc->no_inbound_map = 1;
+
+ dev_set_drvdata(dev, pcie);
+
+ ret = cdns_pcie_hpa_host_setup(rc);
+ if (ret < 0) {
+ pci_ecam_free(pcie->cfg);
+ return ret;
+ }
+
+ return 0;
+}
+
+static const struct of_device_id of_sky1_pcie_match[] = {
+ { .compatible = "cix,sky1-pcie-host", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, of_sky1_pcie_match);
+
+static void sky1_pcie_remove(struct platform_device *pdev)
+{
+ struct sky1_pcie *pcie = platform_get_drvdata(pdev);
+
+ pci_ecam_free(pcie->cfg);
+}
+
+static struct platform_driver sky1_pcie_driver = {
+ .probe = sky1_pcie_probe,
+ .remove = sky1_pcie_remove,
+ .driver = {
+ .name = "sky1-pcie",
+ .of_match_table = of_sky1_pcie_match,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+};
+module_platform_driver(sky1_pcie_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PCIe controller driver for CIX's sky1 SoCs");
+MODULE_AUTHOR("Hans Zhang <hans.zhang@cixtech.com>");
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-common.c b/drivers/pci/controller/cadence/pcie-cadence-host-common.c
new file mode 100644
index 000000000000..15415d7f35ee
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-host-common.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cadence PCIe host controller library.
+ *
+ * Copyright (c) 2017 Cadence
+ * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com>
+ */
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/list_sort.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+#include <linux/platform_device.h>
+
+#include "pcie-cadence.h"
+#include "pcie-cadence-host-common.h"
+
+#define LINK_RETRAIN_TIMEOUT HZ
+
+u64 bar_max_size[] = {
+ [RP_BAR0] = _ULL(128 * SZ_2G),
+ [RP_BAR1] = SZ_2G,
+ [RP_NO_BAR] = _BITULL(63),
+};
+EXPORT_SYMBOL_GPL(bar_max_size);
+
+int cdns_pcie_host_training_complete(struct cdns_pcie *pcie)
+{
+ u32 pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET;
+ unsigned long end_jiffies;
+ u16 lnk_stat;
+
+ /* Wait for link training to complete. Exit after timeout. */
+ end_jiffies = jiffies + LINK_RETRAIN_TIMEOUT;
+ do {
+ lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA);
+ if (!(lnk_stat & PCI_EXP_LNKSTA_LT))
+ break;
+ usleep_range(0, 1000);
+ } while (time_before(jiffies, end_jiffies));
+
+ if (!(lnk_stat & PCI_EXP_LNKSTA_LT))
+ return 0;
+
+ return -ETIMEDOUT;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_training_complete);
+
+int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie,
+ cdns_pcie_linkup_func pcie_link_up)
+{
+ struct device *dev = pcie->dev;
+ int retries;
+
+ /* Check if the link is up or not */
+ for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
+ if (pcie_link_up(pcie)) {
+ dev_info(dev, "Link up\n");
+ return 0;
+ }
+ usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
+ }
+
+ return -ETIMEDOUT;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_wait_for_link);
+
+int cdns_pcie_retrain(struct cdns_pcie *pcie,
+ cdns_pcie_linkup_func pcie_link_up)
+{
+ u32 lnk_cap_sls, pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET;
+ u16 lnk_stat, lnk_ctl;
+ int ret = 0;
+
+ /*
+ * Set the retrain bit if the current speed is 2.5 GT/s,
+ * but the PCIe root port supports > 2.5 GT/s.
+ */
+
+ lnk_cap_sls = cdns_pcie_readl(pcie, (CDNS_PCIE_RP_BASE + pcie_cap_off +
+ PCI_EXP_LNKCAP));
+ if ((lnk_cap_sls & PCI_EXP_LNKCAP_SLS) <= PCI_EXP_LNKCAP_SLS_2_5GB)
+ return ret;
+
+ lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA);
+ if ((lnk_stat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) {
+ lnk_ctl = cdns_pcie_rp_readw(pcie,
+ pcie_cap_off + PCI_EXP_LNKCTL);
+ lnk_ctl |= PCI_EXP_LNKCTL_RL;
+ cdns_pcie_rp_writew(pcie, pcie_cap_off + PCI_EXP_LNKCTL,
+ lnk_ctl);
+
+ ret = cdns_pcie_host_training_complete(pcie);
+ if (ret)
+ return ret;
+
+ ret = cdns_pcie_host_wait_for_link(pcie, pcie_link_up);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_retrain);
+
+int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc,
+ cdns_pcie_linkup_func pcie_link_up)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ int ret;
+
+ ret = cdns_pcie_host_wait_for_link(pcie, pcie_link_up);
+
+ /*
+ * Retrain the link to work around a Gen2 training defect
+ * if the quirk flag is set.
+ */
+ if (!ret && rc->quirk_retrain_flag)
+ ret = cdns_pcie_retrain(pcie, pcie_link_up);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_start_link);
+
+enum cdns_pcie_rp_bar
+cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size)
+{
+ enum cdns_pcie_rp_bar bar, sel_bar;
+
+ sel_bar = RP_BAR_UNDEFINED;
+ for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) {
+ if (!rc->avail_ib_bar[bar])
+ continue;
+
+ if (size <= bar_max_size[bar]) {
+ if (sel_bar == RP_BAR_UNDEFINED) {
+ sel_bar = bar;
+ continue;
+ }
+
+ if (bar_max_size[bar] < bar_max_size[sel_bar])
+ sel_bar = bar;
+ }
+ }
+
+ return sel_bar;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_find_min_bar);
+
+enum cdns_pcie_rp_bar
+cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size)
+{
+ enum cdns_pcie_rp_bar bar, sel_bar;
+
+ sel_bar = RP_BAR_UNDEFINED;
+ for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) {
+ if (!rc->avail_ib_bar[bar])
+ continue;
+
+ if (size >= bar_max_size[bar]) {
+ if (sel_bar == RP_BAR_UNDEFINED) {
+ sel_bar = bar;
+ continue;
+ }
+
+ if (bar_max_size[bar] > bar_max_size[sel_bar])
+ sel_bar = bar;
+ }
+ }
+
+ return sel_bar;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_find_max_bar);
+
+int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct resource_entry *entry1, *entry2;
+
+ entry1 = container_of(a, struct resource_entry, node);
+ entry2 = container_of(b, struct resource_entry, node);
+
+ return resource_size(entry2->res) - resource_size(entry1->res);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_dma_ranges_cmp);
+
+int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc,
+ struct resource_entry *entry,
+ cdns_pcie_host_bar_ib_cfg pci_host_ib_config)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct device *dev = pcie->dev;
+ u64 cpu_addr, size, winsize;
+ enum cdns_pcie_rp_bar bar;
+ unsigned long flags;
+ int ret;
+
+ cpu_addr = entry->res->start;
+ flags = entry->res->flags;
+ size = resource_size(entry->res);
+
+ while (size > 0) {
+ /*
+ * Try to find a minimum BAR whose size is greater than
+ * or equal to the remaining resource_entry size. This will
+ * fail if the size of each of the available BARs is less than
+ * the remaining resource_entry size.
+ *
+ * If a minimum BAR is found, the IB ATU is configured for it
+ * and we exit the loop.
+ */
+ bar = cdns_pcie_host_find_min_bar(rc, size);
+ if (bar != RP_BAR_UNDEFINED) {
+ ret = pci_host_ib_config(rc, bar, cpu_addr, size, flags);
+ if (ret)
+ dev_err(dev, "IB BAR: %d config failed\n", bar);
+ return ret;
+ }
+
+ /*
+ * If the control reaches here, it would mean the remaining
+ * resource_entry size cannot be fitted in a single BAR. So we
+ * find a maximum BAR whose size is less than or equal to the
+ * remaining resource_entry size and split the resource entry
+ * so that part of resource entry is fitted inside the maximum
+ * BAR. The remaining size would be fitted during the next
+ * iteration of the loop.
+ *
+ * If a maximum BAR is not found, there is no way we can fit
+ * this resource_entry, so we error out.
+ */
+ bar = cdns_pcie_host_find_max_bar(rc, size);
+ if (bar == RP_BAR_UNDEFINED) {
+ dev_err(dev, "No free BAR to map cpu_addr %llx\n",
+ cpu_addr);
+ return -EINVAL;
+ }
+
+ winsize = bar_max_size[bar];
+ ret = pci_host_ib_config(rc, bar, cpu_addr, winsize, flags);
+ if (ret) {
+ dev_err(dev, "IB BAR: %d config failed\n", bar);
+ return ret;
+ }
+
+ size -= winsize;
+ cpu_addr += winsize;
+ }
+
+ return 0;
+}
+
+int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc,
+ cdns_pcie_host_bar_ib_cfg pci_host_ib_config)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct device *dev = pcie->dev;
+ struct device_node *np = dev->of_node;
+ struct pci_host_bridge *bridge;
+ struct resource_entry *entry;
+ u32 no_bar_nbits = 32;
+ int err;
+
+ bridge = pci_host_bridge_from_priv(rc);
+ if (!bridge)
+ return -ENOMEM;
+
+ if (list_empty(&bridge->dma_ranges)) {
+ of_property_read_u32(np, "cdns,no-bar-match-nbits",
+ &no_bar_nbits);
+ err = pci_host_ib_config(rc, RP_NO_BAR, 0x0, (u64)1 << no_bar_nbits, 0);
+ if (err)
+ dev_err(dev, "IB BAR: %d config failed\n", RP_NO_BAR);
+ return err;
+ }
+
+ list_sort(NULL, &bridge->dma_ranges, cdns_pcie_host_dma_ranges_cmp);
+
+ resource_list_for_each_entry(entry, &bridge->dma_ranges) {
+ err = cdns_pcie_host_bar_config(rc, entry, pci_host_ib_config);
+ if (err) {
+ dev_err(dev, "Fail to configure IB using dma-ranges\n");
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cadence PCIe host controller driver");
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-common.h b/drivers/pci/controller/cadence/pcie-cadence-host-common.h
new file mode 100644
index 000000000000..fe7d4202a8b6
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-host-common.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Cadence PCIe Host controller driver.
+ *
+ * Copyright (c) 2017 Cadence
+ * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com>
+ */
+#ifndef _PCIE_CADENCE_HOST_COMMON_H
+#define _PCIE_CADENCE_HOST_COMMON_H
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+extern u64 bar_max_size[];
+
+typedef int (*cdns_pcie_host_bar_ib_cfg)(struct cdns_pcie_rc *,
+ enum cdns_pcie_rp_bar,
+ u64,
+ u64,
+ unsigned long);
+typedef bool (*cdns_pcie_linkup_func)(struct cdns_pcie *);
+
+int cdns_pcie_host_training_complete(struct cdns_pcie *pcie);
+int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie,
+ cdns_pcie_linkup_func pcie_link_up);
+int cdns_pcie_retrain(struct cdns_pcie *pcie, cdns_pcie_linkup_func pcie_linkup_func);
+int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc,
+ cdns_pcie_linkup_func pcie_link_up);
+enum cdns_pcie_rp_bar
+cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size);
+enum cdns_pcie_rp_bar
+cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size);
+int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b);
+int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc,
+ enum cdns_pcie_rp_bar bar,
+ u64 cpu_addr,
+ u64 size,
+ unsigned long flags);
+int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc,
+ struct resource_entry *entry,
+ cdns_pcie_host_bar_ib_cfg pci_host_ib_config);
+int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc,
+ cdns_pcie_host_bar_ib_cfg pci_host_ib_config);
+
+#endif /* _PCIE_CADENCE_HOST_COMMON_H */
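These helpers take the inbound-BAR programming routine as a cdns_pcie_host_bar_ib_cfg callback so that the legacy and HPA register layouts can share one dma-ranges sizing and splitting algorithm. A sketch of how a platform driver might plug in, with my_plat_* standing in for its own code:

static int my_plat_bar_ib_config(struct cdns_pcie_rc *rc,
				 enum cdns_pcie_rp_bar bar,
				 u64 cpu_addr, u64 size,
				 unsigned long flags)
{
	/* Program this platform's inbound address-translation registers. */
	return 0;
}

static int my_plat_map_dma_ranges(struct cdns_pcie_rc *rc)
{
	/* The common code sorts dma-ranges, splits entries across free
	 * BARs, and calls back into my_plat_bar_ib_config() per window. */
	return cdns_pcie_host_map_dma_ranges(rc, my_plat_bar_ib_config);
}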
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c b/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c
new file mode 100644
index 000000000000..0f540bed58e8
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cadence PCIe host controller driver.
+ *
+ * Copyright (c) 2024, Cadence Design Systems
+ * Author: Manikandan K Pillai <mpillai@cadence.com>
+ */
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/list_sort.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+
+#include "pcie-cadence.h"
+#include "pcie-cadence-host-common.h"
+
+static u8 bar_aperture_mask[] = {
+ [RP_BAR0] = 0x3F,
+ [RP_BAR1] = 0x3F,
+};
+
+void __iomem *cdns_pci_hpa_map_bus(struct pci_bus *bus, unsigned int devfn,
+ int where)
+{
+ struct pci_host_bridge *bridge = pci_find_host_bridge(bus);
+ struct cdns_pcie_rc *rc = pci_host_bridge_priv(bridge);
+ struct cdns_pcie *pcie = &rc->pcie;
+ unsigned int busn = bus->number;
+ u32 addr0, desc0, desc1, ctrl0;
+ u32 regval;
+
+ if (pci_is_root_bus(bus)) {
+ /*
+ * Only the root port (devfn == 0) is connected to this bus.
+ * All other PCI devices are behind some bridge hence on another
+ * bus.
+ */
+ if (devfn)
+ return NULL;
+
+ return pcie->reg_base + (where & 0xfff);
+ }
+
+ /* Clear AXI link-down status */
+ regval = cdns_pcie_hpa_readl(pcie, REG_BANK_AXI_SLAVE, CDNS_PCIE_HPA_AT_LINKDOWN);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, CDNS_PCIE_HPA_AT_LINKDOWN,
+ (regval & ~GENMASK(0, 0)));
+
+ /* Update Output registers for AXI region 0 */
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(12) |
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) |
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS(busn);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(0), addr0);
+
+ desc1 = cdns_pcie_hpa_readl(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0));
+ desc1 &= ~CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK;
+ desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0);
+ ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS |
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN;
+
+ if (busn == bridge->busnr + 1)
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0;
+ else
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1;
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC0(0), desc0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0), desc1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(0), ctrl0);
+
+ return rc->cfg_base + (where & 0xfff);
+}
+
+static struct pci_ops cdns_pcie_hpa_host_ops = {
+ .map_bus = cdns_pci_hpa_map_bus,
+ .read = pci_generic_config_read,
+ .write = pci_generic_config_write,
+};
+
+static void cdns_pcie_hpa_host_enable_ptm_response(struct cdns_pcie *pcie)
+{
+ u32 val;
+
+ val = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_LM_PTM_CTRL);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_LM_PTM_CTRL,
+ val | CDNS_PCIE_HPA_LM_PTM_CTRL_PTMRSEN);
+}
+
+static int cdns_pcie_hpa_host_bar_ib_config(struct cdns_pcie_rc *rc,
+ enum cdns_pcie_rp_bar bar,
+ u64 cpu_addr, u64 size,
+ unsigned long flags)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ u32 addr0, addr1, aperture, value;
+
+ if (!rc->avail_ib_bar[bar])
+ return -ENODEV;
+
+ rc->avail_ib_bar[bar] = false;
+
+ aperture = ilog2(size);
+ if (bar == RP_NO_BAR) {
+ addr0 = CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS(aperture) |
+ (lower_32_bits(cpu_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(cpu_addr);
+ } else {
+ addr0 = lower_32_bits(cpu_addr);
+ addr1 = upper_32_bits(cpu_addr);
+ }
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_MASTER,
+ CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0(bar), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_MASTER,
+ CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR1(bar), addr1);
+
+ if (bar == RP_NO_BAR)
+ bar = (enum cdns_pcie_rp_bar)BAR_0;
+
+ value = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_CFG_CTRL_REG, CDNS_PCIE_HPA_LM_RC_BAR_CFG);
+ value &= ~(HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) |
+ HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) |
+ HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) |
+ HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) |
+ HPA_LM_RC_BAR_CFG_APERTURE(bar, bar_aperture_mask[bar] + 7));
+ if (size + cpu_addr >= SZ_4G) {
+ value |= HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar);
+ if ((flags & IORESOURCE_PREFETCH))
+ value |= HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar);
+ } else {
+ value |= HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar);
+ if ((flags & IORESOURCE_PREFETCH))
+ value |= HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar);
+ }
+
+ value |= HPA_LM_RC_BAR_CFG_APERTURE(bar, aperture);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG, CDNS_PCIE_HPA_LM_RC_BAR_CFG, value);
+
+ return 0;
+}
+
+static int cdns_pcie_hpa_host_init_root_port(struct cdns_pcie_rc *rc)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ u32 value, ctrl;
+
+ /*
+ * Set the root port BAR configuration register:
+ * - disable both BAR0 and BAR1
+ * - enable Prefetchable Memory Base and Limit registers in type 1
+ * config space (64 bits)
+ * - enable IO Base and Limit registers in type 1 config
+ * space (32 bits)
+ */
+
+ ctrl = CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED;
+ value = CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL(ctrl) |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL(ctrl) |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_ENABLE |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_32BITS;
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG,
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG, value);
+
+ if (rc->vendor_id != 0xffff)
+ cdns_pcie_hpa_rp_writew(pcie, PCI_VENDOR_ID, rc->vendor_id);
+
+ if (rc->device_id != 0xffff)
+ cdns_pcie_hpa_rp_writew(pcie, PCI_DEVICE_ID, rc->device_id);
+
+ cdns_pcie_hpa_rp_writeb(pcie, PCI_CLASS_REVISION, 0);
+ cdns_pcie_hpa_rp_writeb(pcie, PCI_CLASS_PROG, 0);
+ cdns_pcie_hpa_rp_writew(pcie, PCI_CLASS_DEVICE, PCI_CLASS_BRIDGE_PCI);
+
+ /* Enable bus mastering */
+ value = cdns_pcie_hpa_readl(pcie, REG_BANK_RP, PCI_COMMAND);
+ value |= (PCI_COMMAND_MEMORY | PCI_COMMAND_IO | PCI_COMMAND_MASTER);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_RP, PCI_COMMAND, value);
+ return 0;
+}
+
+static void cdns_pcie_hpa_create_region_for_cfg(struct cdns_pcie_rc *rc)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(rc);
+ struct resource *cfg_res = rc->cfg_res;
+ struct resource_entry *entry;
+ u64 cpu_addr = cfg_res->start;
+ u32 addr0, addr1, desc1;
+ int busnr = 0;
+
+ entry = resource_list_first_type(&bridge->windows, IORESOURCE_BUS);
+ if (entry)
+ busnr = entry->res->start;
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_TAG_MANAGEMENT, 0x01000000);
+ /*
+ * Reserve region 0 for PCI config space accesses:
+ * OB_REGION_PCI_ADDR0 and OB_REGION_DESC0 are updated dynamically by
+ * cdns_pci_hpa_map_bus(); the other region registers are set here once for all.
+ */
+ desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(0), 0x0);
+ /* Type-1 CFG */
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC0(0), 0x05000000);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0), desc1);
+
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(12) |
+ (lower_32_bits(cpu_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(cpu_addr);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(0), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(0), addr1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(0), 0x06000000);
+}
+
+static int cdns_pcie_hpa_host_init_address_translation(struct cdns_pcie_rc *rc)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(rc);
+ struct resource_entry *entry;
+ int r = 0, busnr = 0;
+
+ if (!rc->ecam_supported)
+ cdns_pcie_hpa_create_region_for_cfg(rc);
+
+ entry = resource_list_first_type(&bridge->windows, IORESOURCE_BUS);
+ if (entry)
+ busnr = entry->res->start;
+
+ r++;
+ if (pcie->msg_res) {
+ cdns_pcie_hpa_set_outbound_region_for_normal_msg(pcie, busnr, 0, r,
+ pcie->msg_res->start);
+
+ r++;
+ }
+ resource_list_for_each_entry(entry, &bridge->windows) {
+ struct resource *res = entry->res;
+ u64 pci_addr = res->start - entry->offset;
+
+ if (resource_type(res) == IORESOURCE_IO)
+ cdns_pcie_hpa_set_outbound_region(pcie, busnr, 0, r,
+ true,
+ pci_pio_to_address(res->start),
+ pci_addr,
+ resource_size(res));
+ else
+ cdns_pcie_hpa_set_outbound_region(pcie, busnr, 0, r,
+ false,
+ res->start,
+ pci_addr,
+ resource_size(res));
+
+ r++;
+ }
+
+ if (rc->no_inbound_map)
+ return 0;
+
+ return cdns_pcie_host_map_dma_ranges(rc, cdns_pcie_hpa_host_bar_ib_config);
+}
+
+static int cdns_pcie_hpa_host_init(struct cdns_pcie_rc *rc)
+{
+ int err;
+
+ err = cdns_pcie_hpa_host_init_root_port(rc);
+ if (err)
+ return err;
+
+ return cdns_pcie_hpa_host_init_address_translation(rc);
+}
+
+int cdns_pcie_hpa_host_link_setup(struct cdns_pcie_rc *rc)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct device *dev = rc->pcie.dev;
+ int ret;
+
+ if (rc->quirk_detect_quiet_flag)
+ cdns_pcie_hpa_detect_quiet_min_delay_set(&rc->pcie);
+
+ cdns_pcie_hpa_host_enable_ptm_response(pcie);
+
+ ret = cdns_pcie_start_link(pcie);
+ if (ret) {
+ dev_err(dev, "Failed to start link\n");
+ return ret;
+ }
+
+ ret = cdns_pcie_host_wait_for_link(pcie, cdns_pcie_hpa_link_up);
+ if (ret)
+ dev_dbg(dev, "PCIe link never came up\n");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_host_link_setup);
+
+int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc)
+{
+ struct device *dev = rc->pcie.dev;
+ struct platform_device *pdev = to_platform_device(dev);
+ struct pci_host_bridge *bridge;
+ enum cdns_pcie_rp_bar bar;
+ struct cdns_pcie *pcie;
+ struct resource *res;
+ int ret;
+
+ bridge = pci_host_bridge_from_priv(rc);
+ if (!bridge)
+ return -ENOMEM;
+
+ pcie = &rc->pcie;
+ pcie->is_rc = true;
+
+ if (!pcie->reg_base) {
+ pcie->reg_base = devm_platform_ioremap_resource_byname(pdev, "reg");
+ if (IS_ERR(pcie->reg_base)) {
+ dev_err(dev, "missing \"reg\"\n");
+ return PTR_ERR(pcie->reg_base);
+ }
+ }
+
+ /* ECAM config space is remapped at glue layer */
+ if (!rc->cfg_base) {
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
+ rc->cfg_base = devm_pci_remap_cfg_resource(dev, res);
+ if (IS_ERR(rc->cfg_base))
+ return PTR_ERR(rc->cfg_base);
+ rc->cfg_res = res;
+ }
+
+ /* Set the EROM BAR aperture to 0 */
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG, CDNS_PCIE_EROM, 0x0);
+
+ ret = cdns_pcie_hpa_host_link_setup(rc);
+ if (ret)
+ return ret;
+
+ for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++)
+ rc->avail_ib_bar[bar] = true;
+
+ ret = cdns_pcie_hpa_host_init(rc);
+ if (ret)
+ return ret;
+
+ if (!bridge->ops)
+ bridge->ops = &cdns_pcie_hpa_host_ops;
+
+ return pci_host_probe(bridge);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_host_setup);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cadence PCIe host controller driver");
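The two exported entry points above, cdns_pcie_hpa_host_link_setup() and cdns_pcie_hpa_host_setup(), are what an SoC glue driver is expected to call. A minimal probe sketch follows; the driver and helper names are illustrative assumptions, not part of this patch:

/*
 * Illustrative glue-driver probe (assumed, not from this patch): allocate
 * the host bridge with cdns_pcie_rc as private data, point the controller
 * at the per-SoC register-bank offsets, then hand off to the HPA core.
 * Requires <linux/pci.h>, <linux/platform_device.h>, <linux/of_device.h>
 * and "pcie-cadence.h".
 */
static int example_hpa_pcie_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	struct pci_host_bridge *bridge;
	struct cdns_pcie_rc *rc;

	bridge = devm_pci_alloc_host_bridge(dev, sizeof(*rc));
	if (!bridge)
		return -ENOMEM;

	rc = pci_host_bridge_priv(bridge);
	rc->pcie.dev = dev;
	/* Per-SoC register-bank offsets, matched from the device tree */
	rc->pcie.cdns_pcie_reg_offsets = of_device_get_match_data(dev);

	return cdns_pcie_hpa_host_setup(rc);
}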
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host.c b/drivers/pci/controller/cadence/pcie-cadence-host.c
index fffd63d6665e..db3154c1eccb 100644
--- a/drivers/pci/controller/cadence/pcie-cadence-host.c
+++ b/drivers/pci/controller/cadence/pcie-cadence-host.c
@@ -12,14 +12,7 @@
#include <linux/platform_device.h>
#include "pcie-cadence.h"
-
-#define LINK_RETRAIN_TIMEOUT HZ
-
-static u64 bar_max_size[] = {
- [RP_BAR0] = _ULL(128 * SZ_2G),
- [RP_BAR1] = SZ_2G,
- [RP_NO_BAR] = _BITULL(63),
-};
+#include "pcie-cadence-host-common.h"
static u8 bar_aperture_mask[] = {
[RP_BAR0] = 0x1F,
@@ -81,77 +74,6 @@ static struct pci_ops cdns_pcie_host_ops = {
.write = pci_generic_config_write,
};
-static int cdns_pcie_host_training_complete(struct cdns_pcie *pcie)
-{
- u32 pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET;
- unsigned long end_jiffies;
- u16 lnk_stat;
-
- /* Wait for link training to complete. Exit after timeout. */
- end_jiffies = jiffies + LINK_RETRAIN_TIMEOUT;
- do {
- lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA);
- if (!(lnk_stat & PCI_EXP_LNKSTA_LT))
- break;
- usleep_range(0, 1000);
- } while (time_before(jiffies, end_jiffies));
-
- if (!(lnk_stat & PCI_EXP_LNKSTA_LT))
- return 0;
-
- return -ETIMEDOUT;
-}
-
-static int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie)
-{
- struct device *dev = pcie->dev;
- int retries;
-
- /* Check if the link is up or not */
- for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
- if (cdns_pcie_link_up(pcie)) {
- dev_info(dev, "Link up\n");
- return 0;
- }
- usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
- }
-
- return -ETIMEDOUT;
-}
-
-static int cdns_pcie_retrain(struct cdns_pcie *pcie)
-{
- u32 lnk_cap_sls, pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET;
- u16 lnk_stat, lnk_ctl;
- int ret = 0;
-
- /*
- * Set retrain bit if current speed is 2.5 GB/s,
- * but the PCIe root port support is > 2.5 GB/s.
- */
-
- lnk_cap_sls = cdns_pcie_readl(pcie, (CDNS_PCIE_RP_BASE + pcie_cap_off +
- PCI_EXP_LNKCAP));
- if ((lnk_cap_sls & PCI_EXP_LNKCAP_SLS) <= PCI_EXP_LNKCAP_SLS_2_5GB)
- return ret;
-
- lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA);
- if ((lnk_stat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) {
- lnk_ctl = cdns_pcie_rp_readw(pcie,
- pcie_cap_off + PCI_EXP_LNKCTL);
- lnk_ctl |= PCI_EXP_LNKCTL_RL;
- cdns_pcie_rp_writew(pcie, pcie_cap_off + PCI_EXP_LNKCTL,
- lnk_ctl);
-
- ret = cdns_pcie_host_training_complete(pcie);
- if (ret)
- return ret;
-
- ret = cdns_pcie_host_wait_for_link(pcie);
- }
- return ret;
-}
-
static void cdns_pcie_host_disable_ptm_response(struct cdns_pcie *pcie)
{
u32 val;
@@ -168,23 +90,6 @@ static void cdns_pcie_host_enable_ptm_response(struct cdns_pcie *pcie)
cdns_pcie_writel(pcie, CDNS_PCIE_LM_PTM_CTRL, val | CDNS_PCIE_LM_TPM_CTRL_PTMRSEN);
}
-static int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc)
-{
- struct cdns_pcie *pcie = &rc->pcie;
- int ret;
-
- ret = cdns_pcie_host_wait_for_link(pcie);
-
- /*
- * Retrain link for Gen2 training defect
- * if quirk flag is set.
- */
- if (!ret && rc->quirk_retrain_flag)
- ret = cdns_pcie_retrain(pcie);
-
- return ret;
-}
-
static void cdns_pcie_host_deinit_root_port(struct cdns_pcie_rc *rc)
{
struct cdns_pcie *pcie = &rc->pcie;
@@ -245,10 +150,11 @@ static int cdns_pcie_host_init_root_port(struct cdns_pcie_rc *rc)
return 0;
}
-static int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc,
- enum cdns_pcie_rp_bar bar,
- u64 cpu_addr, u64 size,
- unsigned long flags)
+int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc,
+ enum cdns_pcie_rp_bar bar,
+ u64 cpu_addr,
+ u64 size,
+ unsigned long flags)
{
struct cdns_pcie *pcie = &rc->pcie;
u32 addr0, addr1, aperture, value;
@@ -290,137 +196,6 @@ static int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc,
return 0;
}
-static enum cdns_pcie_rp_bar
-cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size)
-{
- enum cdns_pcie_rp_bar bar, sel_bar;
-
- sel_bar = RP_BAR_UNDEFINED;
- for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) {
- if (!rc->avail_ib_bar[bar])
- continue;
-
- if (size <= bar_max_size[bar]) {
- if (sel_bar == RP_BAR_UNDEFINED) {
- sel_bar = bar;
- continue;
- }
-
- if (bar_max_size[bar] < bar_max_size[sel_bar])
- sel_bar = bar;
- }
- }
-
- return sel_bar;
-}
-
-static enum cdns_pcie_rp_bar
-cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size)
-{
- enum cdns_pcie_rp_bar bar, sel_bar;
-
- sel_bar = RP_BAR_UNDEFINED;
- for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) {
- if (!rc->avail_ib_bar[bar])
- continue;
-
- if (size >= bar_max_size[bar]) {
- if (sel_bar == RP_BAR_UNDEFINED) {
- sel_bar = bar;
- continue;
- }
-
- if (bar_max_size[bar] > bar_max_size[sel_bar])
- sel_bar = bar;
- }
- }
-
- return sel_bar;
-}
-
-static int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc,
- struct resource_entry *entry)
-{
- u64 cpu_addr, pci_addr, size, winsize;
- struct cdns_pcie *pcie = &rc->pcie;
- struct device *dev = pcie->dev;
- enum cdns_pcie_rp_bar bar;
- unsigned long flags;
- int ret;
-
- cpu_addr = entry->res->start;
- pci_addr = entry->res->start - entry->offset;
- flags = entry->res->flags;
- size = resource_size(entry->res);
-
- if (entry->offset) {
- dev_err(dev, "PCI addr: %llx must be equal to CPU addr: %llx\n",
- pci_addr, cpu_addr);
- return -EINVAL;
- }
-
- while (size > 0) {
- /*
- * Try to find a minimum BAR whose size is greater than
- * or equal to the remaining resource_entry size. This will
- * fail if the size of each of the available BARs is less than
- * the remaining resource_entry size.
- * If a minimum BAR is found, IB ATU will be configured and
- * exited.
- */
- bar = cdns_pcie_host_find_min_bar(rc, size);
- if (bar != RP_BAR_UNDEFINED) {
- ret = cdns_pcie_host_bar_ib_config(rc, bar, cpu_addr,
- size, flags);
- if (ret)
- dev_err(dev, "IB BAR: %d config failed\n", bar);
- return ret;
- }
-
- /*
- * If the control reaches here, it would mean the remaining
- * resource_entry size cannot be fitted in a single BAR. So we
- * find a maximum BAR whose size is less than or equal to the
- * remaining resource_entry size and split the resource entry
- * so that part of resource entry is fitted inside the maximum
- * BAR. The remaining size would be fitted during the next
- * iteration of the loop.
- * If a maximum BAR is not found, there is no way we can fit
- * this resource_entry, so we error out.
- */
- bar = cdns_pcie_host_find_max_bar(rc, size);
- if (bar == RP_BAR_UNDEFINED) {
- dev_err(dev, "No free BAR to map cpu_addr %llx\n",
- cpu_addr);
- return -EINVAL;
- }
-
- winsize = bar_max_size[bar];
- ret = cdns_pcie_host_bar_ib_config(rc, bar, cpu_addr, winsize,
- flags);
- if (ret) {
- dev_err(dev, "IB BAR: %d config failed\n", bar);
- return ret;
- }
-
- size -= winsize;
- cpu_addr += winsize;
- }
-
- return 0;
-}
-
-static int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a,
- const struct list_head *b)
-{
- struct resource_entry *entry1, *entry2;
-
- entry1 = container_of(a, struct resource_entry, node);
- entry2 = container_of(b, struct resource_entry, node);
-
- return resource_size(entry2->res) - resource_size(entry1->res);
-}
-
static void cdns_pcie_host_unmap_dma_ranges(struct cdns_pcie_rc *rc)
{
struct cdns_pcie *pcie = &rc->pcie;
@@ -447,43 +222,6 @@ static void cdns_pcie_host_unmap_dma_ranges(struct cdns_pcie_rc *rc)
}
}
-static int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc)
-{
- struct cdns_pcie *pcie = &rc->pcie;
- struct device *dev = pcie->dev;
- struct device_node *np = dev->of_node;
- struct pci_host_bridge *bridge;
- struct resource_entry *entry;
- u32 no_bar_nbits = 32;
- int err;
-
- bridge = pci_host_bridge_from_priv(rc);
- if (!bridge)
- return -ENOMEM;
-
- if (list_empty(&bridge->dma_ranges)) {
- of_property_read_u32(np, "cdns,no-bar-match-nbits",
- &no_bar_nbits);
- err = cdns_pcie_host_bar_ib_config(rc, RP_NO_BAR, 0x0,
- (u64)1 << no_bar_nbits, 0);
- if (err)
- dev_err(dev, "IB BAR: %d config failed\n", RP_NO_BAR);
- return err;
- }
-
- list_sort(NULL, &bridge->dma_ranges, cdns_pcie_host_dma_ranges_cmp);
-
- resource_list_for_each_entry(entry, &bridge->dma_ranges) {
- err = cdns_pcie_host_bar_config(rc, entry);
- if (err) {
- dev_err(dev, "Fail to configure IB using dma-ranges\n");
- return err;
- }
- }
-
- return 0;
-}
-
static void cdns_pcie_host_deinit_address_translation(struct cdns_pcie_rc *rc)
{
struct cdns_pcie *pcie = &rc->pcie;
@@ -561,7 +299,7 @@ static int cdns_pcie_host_init_address_translation(struct cdns_pcie_rc *rc)
r++;
}
- return cdns_pcie_host_map_dma_ranges(rc);
+ return cdns_pcie_host_map_dma_ranges(rc, cdns_pcie_host_bar_ib_config);
}
static void cdns_pcie_host_deinit(struct cdns_pcie_rc *rc)
@@ -607,7 +345,7 @@ int cdns_pcie_host_link_setup(struct cdns_pcie_rc *rc)
return ret;
}
- ret = cdns_pcie_host_start_link(rc);
+ ret = cdns_pcie_host_start_link(rc, cdns_pcie_link_up);
if (ret)
dev_dbg(dev, "PCIe link never came up\n");
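The deletions above remove the link-training, link-wait, and dma-ranges helpers from the legacy host driver; the callers now pass callbacks into shared versions in pcie-cadence-host-common.h, so the legacy and HPA paths reuse one implementation. That header is not shown in this hunk; the following prototypes are only a plausible reconstruction from the call sites seen here:

/*
 * Reconstructed declarations (assumed to live in pcie-cadence-host-common.h;
 * inferred from the call sites in this patch, not copied from it).
 */
int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc,
			      bool (*link_up)(struct cdns_pcie *pcie));
int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie,
				 bool (*link_up)(struct cdns_pcie *pcie));
int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc,
				  int (*bar_ib_config)(struct cdns_pcie_rc *rc,
						       enum cdns_pcie_rp_bar bar,
						       u64 cpu_addr, u64 size,
						       unsigned long flags));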
diff --git a/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h b/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h
new file mode 100644
index 000000000000..026e131600de
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Cadence PCIe controller driver.
+ *
+ * Copyright (c) 2024, Cadence Design Systems
+ * Author: Manikandan K Pillai <mpillai@cadence.com>
+ */
+#ifndef _PCIE_CADENCE_HPA_REGS_H
+#define _PCIE_CADENCE_HPA_REGS_H
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/pci-epf.h>
+#include <linux/phy/phy.h>
+#include <linux/bitfield.h>
+
+/* High Performance Architecture (HPA) PCIe controller registers */
+#define CDNS_PCIE_HPA_IP_REG_BANK 0x01000000
+#define CDNS_PCIE_HPA_IP_CFG_CTRL_REG_BANK 0x01003C00
+#define CDNS_PCIE_HPA_IP_AXI_MASTER_COMMON 0x02020000
+
+/* Address Translation Registers */
+#define CDNS_PCIE_HPA_AXI_SLAVE 0x03000000
+#define CDNS_PCIE_HPA_AXI_MASTER 0x03002000
+
+/* Root Port register base address */
+#define CDNS_PCIE_HPA_RP_BASE 0x0
+
+#define CDNS_PCIE_HPA_LM_ID 0x1420
+
+/* Endpoint Function BARs */
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG(bar, fn) \
+ (((bar) < BAR_3) ? CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG0(fn) : \
+ CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG1(fn))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG0(pfn) (0x4000 * (pfn))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG1(pfn) ((0x4000 * (pfn)) + 0x04)
+#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG(bar, fn) \
+ (((bar) < BAR_3) ? CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG0(fn) : \
+ CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG1(fn))
+#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG0(vfn) ((0x4000 * (vfn)) + 0x08)
+#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG1(vfn) ((0x4000 * (vfn)) + 0x0C)
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(f) \
+ (GENMASK(5, 0) << (0x4 + (f) * 10))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \
+ (((a) << (4 + ((b) * 10))) & (CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b)))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(f) \
+ (GENMASK(3, 0) << ((f) * 10))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \
+ (((c) << ((b) * 10)) & (CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b)))
+
+/* Endpoint Function Configuration Register */
+#define CDNS_PCIE_HPA_LM_EP_FUNC_CFG 0x02C0
+
+/* Root Complex BAR Configuration Register */
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG 0x14
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(9, 4)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE(a) \
+ FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE_MASK, a)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(3, 0)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL(c) \
+ FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL_MASK, c)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(19, 14)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE(a) \
+ FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE_MASK, a)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(13, 10)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL(c) \
+ FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL_MASK, c)
+
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(20)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(21)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_ENABLE BIT(22)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_32BITS BIT(23)
+
+/* BAR control values applicable to both Endpoint Function and Root Complex */
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED 0x0
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_IO_32BITS 0x3
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_32BITS 0x1
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x9
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_64BITS 0x5
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0xD
+
+#define HPA_LM_RC_BAR_CFG_CTRL_DISABLED(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_IO_32BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_32BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_64BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_APERTURE(bar, aperture) \
+ (((aperture) - 7) << (((bar) * 10) + 4))
+
+#define CDNS_PCIE_HPA_LM_PTM_CTRL 0x0520
+#define CDNS_PCIE_HPA_LM_PTM_CTRL_PTMRSEN BIT(17)
+
+/* Root Port Registers PCI config space for root port function */
+#define CDNS_PCIE_HPA_RP_CAP_OFFSET 0xC0
+
+/* Region r Outbound AXI to PCIe Address Translation Register 0 */
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r) (0x1010 + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS_MASK)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(23, 16)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK, devfn)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(31, 24)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS(bus) \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS_MASK, bus)
+
+/* Region r Outbound AXI to PCIe Address Translation Register 1 */
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r) (0x1014 + ((r) & 0x1F) * 0x0080)
+
+/* Region r Outbound PCIe Descriptor Register */
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r) (0x1008 + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(28, 24)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MEM \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x0)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_IO \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x2)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x4)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x5)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x10)
+
+/* Region r Outbound PCIe Descriptor Register */
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r) (0x100C + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS_MASK GENMASK(31, 24)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(bus) \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS_MASK, bus)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK GENMASK(23, 16)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(devfn) \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK, devfn)
+
+#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r) (0x1018 + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS BIT(26)
+#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN BIT(25)
+
+/* Region r AXI Region Base Address Register 0 */
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r) (0x1000 + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS_MASK)
+
+/* Region r AXI Region Base Address Register 1 */
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r) (0x1004 + ((r) & 0x1F) * 0x0080)
+
+/* Root Port BAR Inbound PCIe to AXI Address Translation Register */
+#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0(bar) (((bar) * 0x0008))
+#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS_MASK)
+#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR1(bar) (0x04 + ((bar) * 0x0008))
+
+/* AXI link down register */
+#define CDNS_PCIE_HPA_AT_LINKDOWN 0x04
+
+/*
+ * Physical Layer Configuration Register 0
+ * This register contains the parameters required for the functional setup
+ * of the Physical Layer.
+ */
+#define CDNS_PCIE_HPA_PHY_LAYER_CFG0 0x0400
+#define CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK GENMASK(26, 24)
+#define CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY(delay) \
+ FIELD_PREP(CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK, delay)
+#define CDNS_PCIE_HPA_LINK_TRNG_EN_MASK BIT(27)
+
+#define CDNS_PCIE_HPA_PHY_DBG_STS_REG0 0x0420
+
+#define CDNS_PCIE_HPA_RP_MAX_IB 0x3
+#define CDNS_PCIE_HPA_MAX_OB 15
+
+/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */
+#define CDNS_PCIE_HPA_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) (((fn) * 0x0080) + ((bar) * 0x0008))
+#define CDNS_PCIE_HPA_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) (0x4 + ((fn) * 0x0080) + ((bar) * 0x0008))
+
+/* Miscellaneous offsets definitions */
+#define CDNS_PCIE_HPA_TAG_MANAGEMENT 0x0
+#define CDNS_PCIE_HPA_SLAVE_RESP 0x100
+
+#define I_ROOT_PORT_REQ_ID_REG 0x141c
+#define LM_HAL_SBSA_CTRL 0x1170
+
+#define I_PCIE_BUS_NUMBERS (CDNS_PCIE_HPA_RP_BASE + 0x18)
+#define CDNS_PCIE_EROM 0x18
+#endif /* _PCIE_CADENCE_HPA_REGS_H */
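Each root-complex BAR in the HPA layout occupies a 10-bit stride: a 4-bit control field and a 6-bit aperture field, where the macro encodes the aperture as ilog2(size) - 7. A worked example, illustrative only, of the value the host code composes for a 64-bit prefetchable BAR0 with a 1 GiB aperture:

/*
 * Illustrative only: compose CDNS_PCIE_HPA_LM_RC_BAR_CFG for BAR0 as a
 * 64-bit prefetchable memory BAR with a 1 GiB aperture (ilog2(SZ_1G) == 30),
 * mirroring what cdns_pcie_hpa_host_bar_ib_config() does. RP_BAR0 (== 0)
 * comes from the cdns_pcie_rp_bar enum in pcie-cadence.h.
 */
static u32 example_hpa_bar0_cfg(void)
{
	u32 value = 0;

	value |= HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(RP_BAR0);		/* 0x5 << 0 */
	value |= HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(RP_BAR0);	/* 0xD << 0 */
	value |= HPA_LM_RC_BAR_CFG_APERTURE(RP_BAR0, 30);		/* (30 - 7) << 4 */

	return value;	/* 0x17D */
}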
diff --git a/drivers/pci/controller/cadence/pcie-cadence-hpa.c b/drivers/pci/controller/cadence/pcie-cadence-hpa.c
new file mode 100644
index 000000000000..f60a16938265
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-hpa.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cadence PCIe controller driver.
+ *
+ * Copyright (c) 2024, Cadence Design Systems
+ * Author: Manikandan K Pillai <mpillai@cadence.com>
+ */
+#include <linux/kernel.h>
+#include <linux/of.h>
+
+#include "pcie-cadence.h"
+
+bool cdns_pcie_hpa_link_up(struct cdns_pcie *pcie)
+{
+ u32 pl_reg_val;
+
+ pl_reg_val = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_PHY_DBG_STS_REG0);
+ return pl_reg_val & BIT(0);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_link_up);
+
+void cdns_pcie_hpa_detect_quiet_min_delay_set(struct cdns_pcie *pcie)
+{
+ u32 delay = 0x3;
+ u32 ltssm_control_cap;
+
+ /* Set the LTSSM Detect Quiet state min. delay to 2ms */
+ ltssm_control_cap = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG,
+ CDNS_PCIE_HPA_PHY_LAYER_CFG0);
+ ltssm_control_cap = ((ltssm_control_cap &
+ ~CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK) |
+ CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY(delay));
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_REG,
+ CDNS_PCIE_HPA_PHY_LAYER_CFG0, ltssm_control_cap);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_detect_quiet_min_delay_set);
+
+void cdns_pcie_hpa_set_outbound_region(struct cdns_pcie *pcie, u8 busnr, u8 fn,
+ u32 r, bool is_io,
+ u64 cpu_addr, u64 pci_addr, size_t size)
+{
+ /*
+ * roundup_pow_of_two() returns an unsigned long, which is not suited
+ * for 64-bit values.
+ */
+ u64 sz = 1ULL << fls64(size - 1);
+ int nbits = ilog2(sz);
+ u32 addr0, addr1, desc0, desc1, ctrl0;
+
+ if (nbits < 8)
+ nbits = 8;
+
+ /* Set the PCI address */
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) |
+ (lower_32_bits(pci_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(pci_addr);
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r), addr1);
+
+ /* Set the PCIe header descriptor */
+ if (is_io)
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_IO;
+ else
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MEM;
+ desc1 = 0;
+ ctrl0 = 0;
+
+ /*
+ * Whether or not CTRL0 supplies the bus and device numbers (Bits [26]
+ * and [25] of the outbound region CTRL0 register), the PCI function
+ * number must be set into Bits [23:16] of DESC1 anyway.
+ *
+ * In Root Complex mode, the function number is always 0 but in Endpoint
+ * mode, the PCIe controller may support more than one function. This
+ * function number needs to be set properly into the outbound PCIe
+ * descriptor.
+ *
+ * Besides, setting the supply bits is mandatory when in Root Complex
+ * mode: the driver must then provide the bus number in Bits [31:24]
+ * and the device number in Bits [23:16] of DESC1. Like the function
+ * number, the device number is always 0 in Root Complex mode.
+ *
+ * However when in Endpoint mode, we can clear those supply bits, hence
+ * the PCIe controller will use the captured values for the bus and
+ * device numbers.
+ */
+ if (pcie->is_rc) {
+ /* The device and function numbers are always 0 */
+ desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr) |
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0);
+ ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS |
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN;
+ } else {
+ /*
+ * Use captured values for bus and device numbers but still
+ * need to set the function number
+ */
+ desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(fn);
+ }
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r), desc0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r), desc1);
+
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) |
+ (lower_32_bits(cpu_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(cpu_addr);
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r), addr1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r), ctrl0);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_set_outbound_region);
+
+void cdns_pcie_hpa_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie,
+ u8 busnr, u8 fn,
+ u32 r, u64 cpu_addr)
+{
+ u32 addr0, addr1, desc0, desc1, ctrl0;
+
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG;
+ desc1 = 0;
+ ctrl0 = 0;
+
+ /* See cdns_pcie_hpa_set_outbound_region() comments above */
+ if (pcie->is_rc) {
+ desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr) |
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0);
+ ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS |
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN;
+ } else {
+ desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(fn);
+ }
+
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(17) |
+ (lower_32_bits(cpu_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(cpu_addr);
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r), 0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r), 0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r), desc0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r), desc1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r), addr1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r), ctrl0);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_set_outbound_region_for_normal_msg);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cadence PCIe controller driver");
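cdns_pcie_hpa_set_outbound_region() rounds the window up to a power of two and programs one ATU region end to end. A hedged usage sketch follows; the region index, addresses, and size are invented for illustration:

/*
 * Illustrative only: map a 64 MiB non-prefetchable MEM window through
 * outbound region 1 of a root complex (bus 0, function 0). With
 * size = SZ_64M the helper derives nbits = 26, so the low 26 address
 * bits pass through untranslated.
 */
static void example_map_mem_window(struct cdns_pcie *pcie)
{
	cdns_pcie_hpa_set_outbound_region(pcie, 0, 0, 1, false,
					  0x40000000ULL,	/* CPU (AXI) address */
					  0x40000000ULL,	/* PCI address */
					  SZ_64M);
}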
diff --git a/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h b/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h
new file mode 100644
index 000000000000..857b2140c5d2
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Cadence PCIe controller driver.
+ *
+ * Copyright (c) 2017 Cadence
+ * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com>
+ */
+#ifndef _PCIE_CADENCE_LGA_REGS_H
+#define _PCIE_CADENCE_LGA_REGS_H
+
+#include <linux/bitfield.h>
+
+/* Parameters for the waiting for link up routine */
+#define LINK_WAIT_MAX_RETRIES 10
+#define LINK_WAIT_USLEEP_MIN 90000
+#define LINK_WAIT_USLEEP_MAX 100000
+
+/* Local Management Registers */
+#define CDNS_PCIE_LM_BASE 0x00100000
+
+/* Vendor ID Register */
+#define CDNS_PCIE_LM_ID (CDNS_PCIE_LM_BASE + 0x0044)
+#define CDNS_PCIE_LM_ID_VENDOR_MASK GENMASK(15, 0)
+#define CDNS_PCIE_LM_ID_VENDOR_SHIFT 0
+#define CDNS_PCIE_LM_ID_VENDOR(vid) \
+ (((vid) << CDNS_PCIE_LM_ID_VENDOR_SHIFT) & CDNS_PCIE_LM_ID_VENDOR_MASK)
+#define CDNS_PCIE_LM_ID_SUBSYS_MASK GENMASK(31, 16)
+#define CDNS_PCIE_LM_ID_SUBSYS_SHIFT 16
+#define CDNS_PCIE_LM_ID_SUBSYS(sub) \
+ (((sub) << CDNS_PCIE_LM_ID_SUBSYS_SHIFT) & CDNS_PCIE_LM_ID_SUBSYS_MASK)
+
+/* Root Port Requester ID Register */
+#define CDNS_PCIE_LM_RP_RID (CDNS_PCIE_LM_BASE + 0x0228)
+#define CDNS_PCIE_LM_RP_RID_MASK GENMASK(15, 0)
+#define CDNS_PCIE_LM_RP_RID_SHIFT 0
+#define CDNS_PCIE_LM_RP_RID_(rid) \
+ (((rid) << CDNS_PCIE_LM_RP_RID_SHIFT) & CDNS_PCIE_LM_RP_RID_MASK)
+
+/* Endpoint Bus and Device Number Register */
+#define CDNS_PCIE_LM_EP_ID (CDNS_PCIE_LM_BASE + 0x022C)
+#define CDNS_PCIE_LM_EP_ID_DEV_MASK GENMASK(4, 0)
+#define CDNS_PCIE_LM_EP_ID_DEV_SHIFT 0
+#define CDNS_PCIE_LM_EP_ID_BUS_MASK GENMASK(15, 8)
+#define CDNS_PCIE_LM_EP_ID_BUS_SHIFT 8
+
+/* Endpoint Function f BAR b Configuration Registers */
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG(bar, fn) \
+ (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn))
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) \
+ (CDNS_PCIE_LM_BASE + 0x0240 + (fn) * 0x0008)
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn) \
+ (CDNS_PCIE_LM_BASE + 0x0244 + (fn) * 0x0008)
+#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG(bar, fn) \
+ (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn))
+#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) \
+ (CDNS_PCIE_LM_BASE + 0x0280 + (fn) * 0x0008)
+#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn) \
+ (CDNS_PCIE_LM_BASE + 0x0284 + (fn) * 0x0008)
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b) \
+ (GENMASK(4, 0) << ((b) * 8))
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \
+ (((a) << ((b) * 8)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b))
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b) \
+ (GENMASK(7, 5) << ((b) * 8))
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \
+ (((c) << ((b) * 8 + 5)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b))
+
+/* Endpoint Function Configuration Register */
+#define CDNS_PCIE_LM_EP_FUNC_CFG (CDNS_PCIE_LM_BASE + 0x02C0)
+
+/* Root Complex BAR Configuration Register */
+#define CDNS_PCIE_LM_RC_BAR_CFG (CDNS_PCIE_LM_BASE + 0x0300)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(5, 0)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE(a) \
+ (((a) << 0) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(8, 6)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL(c) \
+ (((c) << 6) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(13, 9)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE(a) \
+ (((a) << 9) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(16, 14)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL(c) \
+ (((c) << 14) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK)
+#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(17)
+#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_32BITS 0
+#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(18)
+#define CDNS_PCIE_LM_RC_BAR_CFG_IO_ENABLE BIT(19)
+#define CDNS_PCIE_LM_RC_BAR_CFG_IO_16BITS 0
+#define CDNS_PCIE_LM_RC_BAR_CFG_IO_32BITS BIT(20)
+#define CDNS_PCIE_LM_RC_BAR_CFG_CHECK_ENABLE BIT(31)
+
+/* BAR control values applicable to both Endpoint Function and Root Complex */
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED 0x0
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS 0x1
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS 0x4
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x5
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS 0x6
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0x7
+
+#define LM_RC_BAR_CFG_CTRL_DISABLED(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_APERTURE(bar, aperture) \
+ (((aperture) - 2) << ((bar) * 8))
+
+/* PTM Control Register */
+#define CDNS_PCIE_LM_PTM_CTRL (CDNS_PCIE_LM_BASE + 0x0DA8)
+#define CDNS_PCIE_LM_TPM_CTRL_PTMRSEN BIT(17)
+
+/*
+ * Endpoint Function Registers (PCI configuration space for endpoint functions)
+ */
+#define CDNS_PCIE_EP_FUNC_BASE(fn) (((fn) << 12) & GENMASK(19, 12))
+
+#define CDNS_PCIE_EP_FUNC_MSI_CAP_OFFSET 0x90
+#define CDNS_PCIE_EP_FUNC_MSIX_CAP_OFFSET 0xB0
+#define CDNS_PCIE_EP_FUNC_DEV_CAP_OFFSET 0xC0
+#define CDNS_PCIE_EP_FUNC_SRIOV_CAP_OFFSET 0x200
+
+/* Endpoint PF Registers */
+#define CDNS_PCIE_CORE_PF_I_ARI_CAP_AND_CTRL(fn) (0x144 + (fn) * 0x1000)
+#define CDNS_PCIE_ARI_CAP_NFN_MASK GENMASK(15, 8)
+
+/* Root Port Registers (PCI configuration space for the root port function) */
+#define CDNS_PCIE_RP_BASE 0x00200000
+#define CDNS_PCIE_RP_CAP_OFFSET 0xC0
+
+/* Address Translation Registers */
+#define CDNS_PCIE_AT_BASE 0x00400000
+
+/* Region r Outbound AXI to PCIe Address Translation Register 0 */
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0(r) \
+ (CDNS_PCIE_AT_BASE + 0x0000 + ((r) & 0x1F) * 0x0020)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(19, 12)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \
+ (((devfn) << 12) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(27, 20)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS(bus) \
+ (((bus) << 20) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK)
+
+/* Region r Outbound AXI to PCIe Address Translation Register 1 */
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR1(r) \
+ (CDNS_PCIE_AT_BASE + 0x0004 + ((r) & 0x1F) * 0x0020)
+
+/* Region r Outbound PCIe Descriptor Register 0 */
+#define CDNS_PCIE_AT_OB_REGION_DESC0(r) \
+ (CDNS_PCIE_AT_BASE + 0x0008 + ((r) & 0x1F) * 0x0020)
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(3, 0)
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MEM 0x2
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_IO 0x6
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 0xA
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 0xB
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG 0xC
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_VENDOR_MSG 0xD
+/* Bit 23 MUST be set in RC mode. */
+#define CDNS_PCIE_AT_OB_REGION_DESC0_HARDCODED_RID BIT(23)
+#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK GENMASK(31, 24)
+#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN(devfn) \
+ (((devfn) << 24) & CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK)
+
+/* Region r Outbound PCIe Descriptor Register 1 */
+#define CDNS_PCIE_AT_OB_REGION_DESC1(r) \
+ (CDNS_PCIE_AT_BASE + 0x000C + ((r) & 0x1F) * 0x0020)
+#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK GENMASK(7, 0)
+#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS(bus) \
+ ((bus) & CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK)
+
+/* Region r AXI Region Base Address Register 0 */
+#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0(r) \
+ (CDNS_PCIE_AT_BASE + 0x0018 + ((r) & 0x1F) * 0x0020)
+#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK)
+
+/* Region r AXI Region Base Address Register 1 */
+#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR1(r) \
+ (CDNS_PCIE_AT_BASE + 0x001C + ((r) & 0x1F) * 0x0020)
+
+/* Root Port BAR Inbound PCIe to AXI Address Translation Register */
+#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0(bar) \
+ (CDNS_PCIE_AT_BASE + 0x0800 + (bar) * 0x0008)
+#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK)
+#define CDNS_PCIE_AT_IB_RP_BAR_ADDR1(bar) \
+ (CDNS_PCIE_AT_BASE + 0x0804 + (bar) * 0x0008)
+
+/* AXI link down register */
+#define CDNS_PCIE_AT_LINKDOWN (CDNS_PCIE_AT_BASE + 0x0824)
+
+/* LTSSM Capabilities register */
+#define CDNS_PCIE_LTSSM_CONTROL_CAP (CDNS_PCIE_LM_BASE + 0x0054)
+#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK GENMASK(2, 1)
+#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT 1
+#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY(delay) \
+ (((delay) << CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT) & \
+ CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK)
+
+#define CDNS_PCIE_RP_MAX_IB 0x3
+#define CDNS_PCIE_MAX_OB 32
+
+/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */
+#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) \
+ (CDNS_PCIE_AT_BASE + 0x0840 + (fn) * 0x0040 + (bar) * 0x0008)
+#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) \
+ (CDNS_PCIE_AT_BASE + 0x0844 + (fn) * 0x0040 + (bar) * 0x0008)
+
+/* Normal/Vendor specific message access: offset inside some outbound region */
+#define CDNS_PCIE_NORMAL_MSG_ROUTING_MASK GENMASK(7, 5)
+#define CDNS_PCIE_NORMAL_MSG_ROUTING(route) \
+ (((route) << 5) & CDNS_PCIE_NORMAL_MSG_ROUTING_MASK)
+#define CDNS_PCIE_NORMAL_MSG_CODE_MASK GENMASK(15, 8)
+#define CDNS_PCIE_NORMAL_MSG_CODE(code) \
+ (((code) << 8) & CDNS_PCIE_NORMAL_MSG_CODE_MASK)
+#define CDNS_PCIE_MSG_NO_DATA BIT(16)
+
+#endif /* _PCIE_CADENCE_LGA_REGS_H */
diff --git a/drivers/pci/controller/cadence/pcie-cadence-plat.c b/drivers/pci/controller/cadence/pcie-cadence-plat.c
index 0456845dabb9..b067a3296dd3 100644
--- a/drivers/pci/controller/cadence/pcie-cadence-plat.c
+++ b/drivers/pci/controller/cadence/pcie-cadence-plat.c
@@ -22,10 +22,6 @@ struct cdns_plat_pcie {
struct cdns_pcie *pcie;
};
-struct cdns_plat_pcie_of_data {
- bool is_rc;
-};
-
static const struct of_device_id cdns_plat_pcie_of_match[];
static u64 cdns_plat_cpu_addr_fixup(struct cdns_pcie *pcie, u64 cpu_addr)
@@ -177,4 +173,7 @@ static struct platform_driver cdns_plat_pcie_driver = {
.probe = cdns_plat_pcie_probe,
.shutdown = cdns_plat_pcie_shutdown,
};
-builtin_platform_driver(cdns_plat_pcie_driver);
+module_platform_driver(cdns_plat_pcie_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cadence PCIe controller platform driver");
diff --git a/drivers/pci/controller/cadence/pcie-cadence.c b/drivers/pci/controller/cadence/pcie-cadence.c
index bd683d0fecb2..e6f1a4ac0fb7 100644
--- a/drivers/pci/controller/cadence/pcie-cadence.c
+++ b/drivers/pci/controller/cadence/pcie-cadence.c
@@ -23,6 +23,17 @@ u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap)
}
EXPORT_SYMBOL_GPL(cdns_pcie_find_ext_capability);
+bool cdns_pcie_linkup(struct cdns_pcie *pcie)
+{
+ u32 pl_reg_val;
+
+ pl_reg_val = cdns_pcie_readl(pcie, CDNS_PCIE_LM_BASE);
+ return pl_reg_val & BIT(0);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_linkup);
+
void cdns_pcie_detect_quiet_min_delay_set(struct cdns_pcie *pcie)
{
u32 delay = 0x3;
@@ -293,6 +304,7 @@ const struct dev_pm_ops cdns_pcie_pm_ops = {
NOIRQ_SYSTEM_SLEEP_PM_OPS(cdns_pcie_suspend_noirq,
cdns_pcie_resume_noirq)
};
+EXPORT_SYMBOL_GPL(cdns_pcie_pm_ops);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Cadence PCIe controller driver");
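Exporting cdns_pcie_pm_ops lets the now-modular glue drivers reference the shared suspend/resume hooks directly. A sketch of the intended wiring (the driver name is hypothetical):

/* Hypothetical glue-driver wiring (illustrative only) */
static struct platform_driver example_pcie_driver = {
	.probe = example_hpa_pcie_probe,
	.driver = {
		.name = "example-cdns-pcie",
		.pm = &cdns_pcie_pm_ops,
	},
};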
diff --git a/drivers/pci/controller/cadence/pcie-cadence.h b/drivers/pci/controller/cadence/pcie-cadence.h
index e2a853d2c0ab..443033c607d7 100644
--- a/drivers/pci/controller/cadence/pcie-cadence.h
+++ b/drivers/pci/controller/cadence/pcie-cadence.h
@@ -7,211 +7,12 @@
#define _PCIE_CADENCE_H
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pci-epf.h>
#include <linux/phy/phy.h>
-
-/* Parameters for the waiting for link up routine */
-#define LINK_WAIT_MAX_RETRIES 10
-#define LINK_WAIT_USLEEP_MIN 90000
-#define LINK_WAIT_USLEEP_MAX 100000
-
-/*
- * Local Management Registers
- */
-#define CDNS_PCIE_LM_BASE 0x00100000
-
-/* Vendor ID Register */
-#define CDNS_PCIE_LM_ID (CDNS_PCIE_LM_BASE + 0x0044)
-#define CDNS_PCIE_LM_ID_VENDOR_MASK GENMASK(15, 0)
-#define CDNS_PCIE_LM_ID_VENDOR_SHIFT 0
-#define CDNS_PCIE_LM_ID_VENDOR(vid) \
- (((vid) << CDNS_PCIE_LM_ID_VENDOR_SHIFT) & CDNS_PCIE_LM_ID_VENDOR_MASK)
-#define CDNS_PCIE_LM_ID_SUBSYS_MASK GENMASK(31, 16)
-#define CDNS_PCIE_LM_ID_SUBSYS_SHIFT 16
-#define CDNS_PCIE_LM_ID_SUBSYS(sub) \
- (((sub) << CDNS_PCIE_LM_ID_SUBSYS_SHIFT) & CDNS_PCIE_LM_ID_SUBSYS_MASK)
-
-/* Root Port Requester ID Register */
-#define CDNS_PCIE_LM_RP_RID (CDNS_PCIE_LM_BASE + 0x0228)
-#define CDNS_PCIE_LM_RP_RID_MASK GENMASK(15, 0)
-#define CDNS_PCIE_LM_RP_RID_SHIFT 0
-#define CDNS_PCIE_LM_RP_RID_(rid) \
- (((rid) << CDNS_PCIE_LM_RP_RID_SHIFT) & CDNS_PCIE_LM_RP_RID_MASK)
-
-/* Endpoint Bus and Device Number Register */
-#define CDNS_PCIE_LM_EP_ID (CDNS_PCIE_LM_BASE + 0x022c)
-#define CDNS_PCIE_LM_EP_ID_DEV_MASK GENMASK(4, 0)
-#define CDNS_PCIE_LM_EP_ID_DEV_SHIFT 0
-#define CDNS_PCIE_LM_EP_ID_BUS_MASK GENMASK(15, 8)
-#define CDNS_PCIE_LM_EP_ID_BUS_SHIFT 8
-
-/* Endpoint Function f BAR b Configuration Registers */
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG(bar, fn) \
- (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn))
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) \
- (CDNS_PCIE_LM_BASE + 0x0240 + (fn) * 0x0008)
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn) \
- (CDNS_PCIE_LM_BASE + 0x0244 + (fn) * 0x0008)
-#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG(bar, fn) \
- (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn))
-#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) \
- (CDNS_PCIE_LM_BASE + 0x0280 + (fn) * 0x0008)
-#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn) \
- (CDNS_PCIE_LM_BASE + 0x0284 + (fn) * 0x0008)
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b) \
- (GENMASK(4, 0) << ((b) * 8))
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \
- (((a) << ((b) * 8)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b))
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b) \
- (GENMASK(7, 5) << ((b) * 8))
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \
- (((c) << ((b) * 8 + 5)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b))
-
-/* Endpoint Function Configuration Register */
-#define CDNS_PCIE_LM_EP_FUNC_CFG (CDNS_PCIE_LM_BASE + 0x02c0)
-
-/* Root Complex BAR Configuration Register */
-#define CDNS_PCIE_LM_RC_BAR_CFG (CDNS_PCIE_LM_BASE + 0x0300)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(5, 0)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE(a) \
- (((a) << 0) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(8, 6)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL(c) \
- (((c) << 6) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(13, 9)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE(a) \
- (((a) << 9) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(16, 14)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL(c) \
- (((c) << 14) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK)
-#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(17)
-#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_32BITS 0
-#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(18)
-#define CDNS_PCIE_LM_RC_BAR_CFG_IO_ENABLE BIT(19)
-#define CDNS_PCIE_LM_RC_BAR_CFG_IO_16BITS 0
-#define CDNS_PCIE_LM_RC_BAR_CFG_IO_32BITS BIT(20)
-#define CDNS_PCIE_LM_RC_BAR_CFG_CHECK_ENABLE BIT(31)
-
-/* BAR control values applicable to both Endpoint Function and Root Complex */
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED 0x0
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS 0x1
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS 0x4
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x5
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS 0x6
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0x7
-
-#define LM_RC_BAR_CFG_CTRL_DISABLED(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_APERTURE(bar, aperture) \
- (((aperture) - 2) << ((bar) * 8))
-
-/* PTM Control Register */
-#define CDNS_PCIE_LM_PTM_CTRL (CDNS_PCIE_LM_BASE + 0x0da8)
-#define CDNS_PCIE_LM_TPM_CTRL_PTMRSEN BIT(17)
-
-/*
- * Endpoint Function Registers (PCI configuration space for endpoint functions)
- */
-#define CDNS_PCIE_EP_FUNC_BASE(fn) (((fn) << 12) & GENMASK(19, 12))
-
-/*
- * Endpoint PF Registers
- */
-#define CDNS_PCIE_CORE_PF_I_ARI_CAP_AND_CTRL(fn) (0x144 + (fn) * 0x1000)
-#define CDNS_PCIE_ARI_CAP_NFN_MASK GENMASK(15, 8)
-
-/*
- * Root Port Registers (PCI configuration space for the root port function)
- */
-#define CDNS_PCIE_RP_BASE 0x00200000
-#define CDNS_PCIE_RP_CAP_OFFSET 0xc0
-
-/*
- * Address Translation Registers
- */
-#define CDNS_PCIE_AT_BASE 0x00400000
-
-/* Region r Outbound AXI to PCIe Address Translation Register 0 */
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0(r) \
- (CDNS_PCIE_AT_BASE + 0x0000 + ((r) & 0x1f) * 0x0020)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \
- (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(19, 12)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \
- (((devfn) << 12) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(27, 20)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS(bus) \
- (((bus) << 20) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK)
-
-/* Region r Outbound AXI to PCIe Address Translation Register 1 */
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR1(r) \
- (CDNS_PCIE_AT_BASE + 0x0004 + ((r) & 0x1f) * 0x0020)
-
-/* Region r Outbound PCIe Descriptor Register 0 */
-#define CDNS_PCIE_AT_OB_REGION_DESC0(r) \
- (CDNS_PCIE_AT_BASE + 0x0008 + ((r) & 0x1f) * 0x0020)
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(3, 0)
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MEM 0x2
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_IO 0x6
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 0xa
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 0xb
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG 0xc
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_VENDOR_MSG 0xd
-/* Bit 23 MUST be set in RC mode. */
-#define CDNS_PCIE_AT_OB_REGION_DESC0_HARDCODED_RID BIT(23)
-#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK GENMASK(31, 24)
-#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN(devfn) \
- (((devfn) << 24) & CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK)
-
-/* Region r Outbound PCIe Descriptor Register 1 */
-#define CDNS_PCIE_AT_OB_REGION_DESC1(r) \
- (CDNS_PCIE_AT_BASE + 0x000c + ((r) & 0x1f) * 0x0020)
-#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK GENMASK(7, 0)
-#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS(bus) \
- ((bus) & CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK)
-
-/* Region r AXI Region Base Address Register 0 */
-#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0(r) \
- (CDNS_PCIE_AT_BASE + 0x0018 + ((r) & 0x1f) * 0x0020)
-#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0)
-#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \
- (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK)
-
-/* Region r AXI Region Base Address Register 1 */
-#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR1(r) \
- (CDNS_PCIE_AT_BASE + 0x001c + ((r) & 0x1f) * 0x0020)
-
-/* Root Port BAR Inbound PCIe to AXI Address Translation Register */
-#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0(bar) \
- (CDNS_PCIE_AT_BASE + 0x0800 + (bar) * 0x0008)
-#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0)
-#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \
- (((nbits) - 1) & CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK)
-#define CDNS_PCIE_AT_IB_RP_BAR_ADDR1(bar) \
- (CDNS_PCIE_AT_BASE + 0x0804 + (bar) * 0x0008)
-
-/* AXI link down register */
-#define CDNS_PCIE_AT_LINKDOWN (CDNS_PCIE_AT_BASE + 0x0824)
-
-/* LTSSM Capabilities register */
-#define CDNS_PCIE_LTSSM_CONTROL_CAP (CDNS_PCIE_LM_BASE + 0x0054)
-#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK GENMASK(2, 1)
-#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT 1
-#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY(delay) \
- (((delay) << CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT) & \
- CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK)
+#include "pcie-cadence-lga-regs.h"
+#include "pcie-cadence-hpa-regs.h"
enum cdns_pcie_rp_bar {
RP_BAR_UNDEFINED = -1,
@@ -220,42 +21,63 @@ enum cdns_pcie_rp_bar {
RP_NO_BAR
};
-#define CDNS_PCIE_RP_MAX_IB 0x3
-#define CDNS_PCIE_MAX_OB 32
-
struct cdns_pcie_rp_ib_bar {
u64 size;
bool free;
};
-/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */
-#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) \
- (CDNS_PCIE_AT_BASE + 0x0840 + (fn) * 0x0040 + (bar) * 0x0008)
-#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) \
- (CDNS_PCIE_AT_BASE + 0x0844 + (fn) * 0x0040 + (bar) * 0x0008)
-
-/* Normal/Vendor specific message access: offset inside some outbound region */
-#define CDNS_PCIE_NORMAL_MSG_ROUTING_MASK GENMASK(7, 5)
-#define CDNS_PCIE_NORMAL_MSG_ROUTING(route) \
- (((route) << 5) & CDNS_PCIE_NORMAL_MSG_ROUTING_MASK)
-#define CDNS_PCIE_NORMAL_MSG_CODE_MASK GENMASK(15, 8)
-#define CDNS_PCIE_NORMAL_MSG_CODE(code) \
- (((code) << 8) & CDNS_PCIE_NORMAL_MSG_CODE_MASK)
-#define CDNS_PCIE_MSG_DATA BIT(16)
-
struct cdns_pcie;
+struct cdns_pcie_rc;
+
+enum cdns_pcie_reg_bank {
+ REG_BANK_RP,
+ REG_BANK_IP_REG,
+ REG_BANK_IP_CFG_CTRL_REG,
+ REG_BANK_AXI_MASTER_COMMON,
+ REG_BANK_AXI_MASTER,
+ REG_BANK_AXI_SLAVE,
+ REG_BANK_AXI_HLS,
+ REG_BANK_AXI_RAS,
+ REG_BANK_AXI_DTI,
+ REG_BANKS_MAX,
+};
struct cdns_pcie_ops {
- int (*start_link)(struct cdns_pcie *pcie);
- void (*stop_link)(struct cdns_pcie *pcie);
- bool (*link_up)(struct cdns_pcie *pcie);
+ int (*start_link)(struct cdns_pcie *pcie);
+ void (*stop_link)(struct cdns_pcie *pcie);
+ bool (*link_up)(struct cdns_pcie *pcie);
u64 (*cpu_addr_fixup)(struct cdns_pcie *pcie, u64 cpu_addr);
};
/**
+ * struct cdns_plat_pcie_of_data - Register bank offsets for a platform
+ * @is_rc: controller is an RC
+ * @ip_reg_bank_offset: IP register bank start offset
+ * @ip_cfg_ctrl_reg_offset: IP config control register start offset
+ * @axi_mstr_common_offset: AXI master common register start offset
+ * @axi_slave_offset: AXI slave start offset
+ * @axi_master_offset: AXI master start offset
+ * @axi_hls_offset: AXI HLS start offset
+ * @axi_ras_offset: AXI RAS start offset
+ * @axi_dti_offset: AXI DTI start offset
+ */
+struct cdns_plat_pcie_of_data {
+ u32 is_rc:1;
+ u32 ip_reg_bank_offset;
+ u32 ip_cfg_ctrl_reg_offset;
+ u32 axi_mstr_common_offset;
+ u32 axi_slave_offset;
+ u32 axi_master_offset;
+ u32 axi_hls_offset;
+ u32 axi_ras_offset;
+ u32 axi_dti_offset;
+};
+
+/**
* struct cdns_pcie - private data for Cadence PCIe controller drivers
* @reg_base: IO mapped register base
* @mem_res: start/end offsets in the physical system memory to map PCI accesses
+ * @msg_res: region reserved for sending outbound normal messages
* @dev: PCIe controller
* @is_rc: tell whether the PCIe controller mode is Root Complex or Endpoint.
* @phy_count: number of supported PHY devices
@@ -263,16 +85,19 @@ struct cdns_pcie_ops {
* @link: list of pointers to corresponding device link representations
* @ops: Platform-specific ops to control various inputs from Cadence PCIe
* wrapper
+ * @cdns_pcie_reg_offsets: Per-SoC register bank offsets
*/
struct cdns_pcie {
- void __iomem *reg_base;
- struct resource *mem_res;
- struct device *dev;
- bool is_rc;
- int phy_count;
- struct phy **phy;
- struct device_link **link;
- const struct cdns_pcie_ops *ops;
+ void __iomem *reg_base;
+ struct resource *mem_res;
+ struct resource *msg_res;
+ struct device *dev;
+ bool is_rc;
+ int phy_count;
+ struct phy **phy;
+ struct device_link **link;
+ const struct cdns_pcie_ops *ops;
+ const struct cdns_plat_pcie_of_data *cdns_pcie_reg_offsets;
};
/**
@@ -288,6 +113,8 @@ struct cdns_pcie {
* available
* @quirk_retrain_flag: Retrain link as quirk for PCIe Gen2
* @quirk_detect_quiet_flag: LTSSM Detect Quiet min delay set as quirk
+ * @ecam_supported: Whether ECAM is supported
+ * @no_inbound_map: Whether inbound address mapping must be skipped
*/
struct cdns_pcie_rc {
struct cdns_pcie pcie;
@@ -298,6 +125,8 @@ struct cdns_pcie_rc {
bool avail_ib_bar[CDNS_PCIE_RP_MAX_IB];
unsigned int quirk_retrain_flag:1;
unsigned int quirk_detect_quiet_flag:1;
+ unsigned int ecam_supported:1;
+ unsigned int no_inbound_map:1;
};
/**
@@ -350,6 +179,43 @@ struct cdns_pcie_ep {
unsigned int quirk_disable_flr:1;
};
+static inline u32 cdns_reg_bank_to_off(struct cdns_pcie *pcie, enum cdns_pcie_reg_bank bank)
+{
+ u32 offset = 0x0;
+
+ switch (bank) {
+ case REG_BANK_RP:
+ offset = 0;
+ break;
+ case REG_BANK_IP_REG:
+ offset = pcie->cdns_pcie_reg_offsets->ip_reg_bank_offset;
+ break;
+ case REG_BANK_IP_CFG_CTRL_REG:
+ offset = pcie->cdns_pcie_reg_offsets->ip_cfg_ctrl_reg_offset;
+ break;
+ case REG_BANK_AXI_MASTER_COMMON:
+ offset = pcie->cdns_pcie_reg_offsets->axi_mstr_common_offset;
+ break;
+ case REG_BANK_AXI_MASTER:
+ offset = pcie->cdns_pcie_reg_offsets->axi_master_offset;
+ break;
+ case REG_BANK_AXI_SLAVE:
+ offset = pcie->cdns_pcie_reg_offsets->axi_slave_offset;
+ break;
+ case REG_BANK_AXI_HLS:
+ offset = pcie->cdns_pcie_reg_offsets->axi_hls_offset;
+ break;
+ case REG_BANK_AXI_RAS:
+ offset = pcie->cdns_pcie_reg_offsets->axi_ras_offset;
+ break;
+ case REG_BANK_AXI_DTI:
+ offset = pcie->cdns_pcie_reg_offsets->axi_dti_offset;
+ break;
+ default:
+ break;
+ }
+ return offset;
+}
+
/* Register access */
static inline void cdns_pcie_writel(struct cdns_pcie *pcie, u32 reg, u32 value)
@@ -362,6 +228,27 @@ static inline u32 cdns_pcie_readl(struct cdns_pcie *pcie, u32 reg)
return readl(pcie->reg_base + reg);
}
+static inline void cdns_pcie_hpa_writel(struct cdns_pcie *pcie,
+ enum cdns_pcie_reg_bank bank,
+ u32 reg,
+ u32 value)
+{
+ u32 offset = cdns_reg_bank_to_off(pcie, bank);
+
+ reg += offset;
+ writel(value, pcie->reg_base + reg);
+}
+
+static inline u32 cdns_pcie_hpa_readl(struct cdns_pcie *pcie,
+ enum cdns_pcie_reg_bank bank,
+ u32 reg)
+{
+ u32 offset = cdns_reg_bank_to_off(pcie, bank);
+
+ reg += offset;
+ return readl(pcie->reg_base + reg);
+}
+
static inline u16 cdns_pcie_readw(struct cdns_pcie *pcie, u32 reg)
{
return readw(pcie->reg_base + reg);
@@ -457,6 +344,29 @@ static inline u16 cdns_pcie_rp_readw(struct cdns_pcie *pcie, u32 reg)
return cdns_pcie_read_sz(addr, 0x2);
}
+static inline void cdns_pcie_hpa_rp_writeb(struct cdns_pcie *pcie,
+ u32 reg, u8 value)
+{
+ void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg;
+
+ cdns_pcie_write_sz(addr, 0x1, value);
+}
+
+static inline void cdns_pcie_hpa_rp_writew(struct cdns_pcie *pcie,
+ u32 reg, u16 value)
+{
+ void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg;
+
+ cdns_pcie_write_sz(addr, 0x2, value);
+}
+
+static inline u16 cdns_pcie_hpa_rp_readw(struct cdns_pcie *pcie, u32 reg)
+{
+ void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg;
+
+ return cdns_pcie_read_sz(addr, 0x2);
+}
+
/* Endpoint Function register access */
static inline void cdns_pcie_ep_fn_writeb(struct cdns_pcie *pcie, u8 fn,
u32 reg, u8 value)
@@ -521,6 +431,7 @@ int cdns_pcie_host_setup(struct cdns_pcie_rc *rc);
void cdns_pcie_host_disable(struct cdns_pcie_rc *rc);
void __iomem *cdns_pci_map_bus(struct pci_bus *bus, unsigned int devfn,
int where);
+int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc);
#else
static inline int cdns_pcie_host_link_setup(struct cdns_pcie_rc *rc)
{
@@ -537,6 +448,11 @@ static inline int cdns_pcie_host_setup(struct cdns_pcie_rc *rc)
return 0;
}
+static inline int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc)
+{
+ return 0;
+}
+
static inline void cdns_pcie_host_disable(struct cdns_pcie_rc *rc)
{
}
@@ -551,6 +467,7 @@ static inline void __iomem *cdns_pci_map_bus(struct pci_bus *bus, unsigned int d
#if IS_ENABLED(CONFIG_PCIE_CADENCE_EP)
int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep);
void cdns_pcie_ep_disable(struct cdns_pcie_ep *ep);
+int cdns_pcie_hpa_ep_setup(struct cdns_pcie_ep *ep);
#else
static inline int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep)
{
@@ -560,10 +477,17 @@ static inline int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep)
static inline void cdns_pcie_ep_disable(struct cdns_pcie_ep *ep)
{
}
+
+static inline int cdns_pcie_hpa_ep_setup(struct cdns_pcie_ep *ep)
+{
+ return 0;
+}
+
#endif
-u8 cdns_pcie_find_capability(struct cdns_pcie *pcie, u8 cap);
-u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap);
+u8 cdns_pcie_find_capability(struct cdns_pcie *pcie, u8 cap);
+u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap);
+bool cdns_pcie_linkup(struct cdns_pcie *pcie);
void cdns_pcie_detect_quiet_min_delay_set(struct cdns_pcie *pcie);
@@ -577,8 +501,23 @@ void cdns_pcie_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie,
void cdns_pcie_reset_outbound_region(struct cdns_pcie *pcie, u32 r);
void cdns_pcie_disable_phy(struct cdns_pcie *pcie);
-int cdns_pcie_enable_phy(struct cdns_pcie *pcie);
-int cdns_pcie_init_phy(struct device *dev, struct cdns_pcie *pcie);
+int cdns_pcie_enable_phy(struct cdns_pcie *pcie);
+int cdns_pcie_init_phy(struct device *dev, struct cdns_pcie *pcie);
+void cdns_pcie_hpa_detect_quiet_min_delay_set(struct cdns_pcie *pcie);
+void cdns_pcie_hpa_set_outbound_region(struct cdns_pcie *pcie, u8 busnr, u8 fn,
+ u32 r, bool is_io,
+ u64 cpu_addr, u64 pci_addr, size_t size);
+void cdns_pcie_hpa_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie,
+ u8 busnr, u8 fn,
+ u32 r, u64 cpu_addr);
+int cdns_pcie_hpa_host_link_setup(struct cdns_pcie_rc *rc);
+void __iomem *cdns_pci_hpa_map_bus(struct pci_bus *bus, unsigned int devfn,
+ int where);
+int cdns_pcie_hpa_host_start_link(struct cdns_pcie_rc *rc);
+int cdns_pcie_hpa_start_link(struct cdns_pcie *pcie);
+void cdns_pcie_hpa_stop_link(struct cdns_pcie *pcie);
+bool cdns_pcie_hpa_link_up(struct cdns_pcie *pcie);
+
extern const struct dev_pm_ops cdns_pcie_pm_ops;
#endif /* _PCIE_CADENCE_H */
diff --git a/drivers/pci/controller/cadence/pcie-sg2042.c b/drivers/pci/controller/cadence/pcie-sg2042.c
index a077b28d4894..0c50c74d03ee 100644
--- a/drivers/pci/controller/cadence/pcie-sg2042.c
+++ b/drivers/pci/controller/cadence/pcie-sg2042.c
@@ -74,15 +74,12 @@ static int sg2042_pcie_probe(struct platform_device *pdev)
static void sg2042_pcie_remove(struct platform_device *pdev)
{
struct cdns_pcie *pcie = platform_get_drvdata(pdev);
- struct device *dev = &pdev->dev;
struct cdns_pcie_rc *rc;
rc = container_of(pcie, struct cdns_pcie_rc, pcie);
cdns_pcie_host_disable(rc);
cdns_pcie_disable_phy(pcie);
-
- pm_runtime_disable(dev);
}
static int sg2042_pcie_suspend_noirq(struct device *dev)
diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig
index 349d4657393c..519b59422b47 100644
--- a/drivers/pci/controller/dwc/Kconfig
+++ b/drivers/pci/controller/dwc/Kconfig
@@ -256,6 +256,16 @@ config PCIE_TEGRA194_EP
in order to enable device-specific features PCIE_TEGRA194_EP must be
selected. This uses the DesignWare core.
+config PCIE_NXP_S32G
+ bool "NXP S32G PCIe controller (host mode)"
+ depends on ARCH_S32 || COMPILE_TEST
+ select PCIE_DW_HOST
+ help
+ Enable support for the PCIe controller in NXP S32G based boards to
+ work in host mode. The controller is based on DesignWare IP and
+ can work either as RC or EP. In order to enable host-specific
+ features, PCIE_NXP_S32G must be selected.
+
config PCIE_DW_PLAT
bool
@@ -416,6 +426,19 @@ config PCIE_SOPHGO_DW
Say Y here if you want PCIe host controller support on
Sophgo SoCs.
+config PCIE_SPACEMIT_K1
+ tristate "SpacemiT K1 PCIe controller (host mode)"
+ depends on ARCH_SPACEMIT || COMPILE_TEST
+ depends on HAS_IOMEM
+ select PCIE_DW_HOST
+ select PCI_PWRCTRL_SLOT
+ default ARCH_SPACEMIT
+ help
+ Enables support for the DesignWare based PCIe controller in
+ the SpacemiT K1 SoC operating in host mode. Three controllers
+ are available on the K1 SoC; the first of these shares a PHY
+ with a USB 3.0 host controller (one or the other can be used).
+
config PCIE_SPEAR13XX
bool "STMicroelectronics SPEAr PCIe controller"
depends on ARCH_SPEAR13XX || COMPILE_TEST
@@ -482,15 +505,21 @@ config PCI_DRA7XX_EP
to enable device-specific features PCI_DRA7XX_EP must be selected.
This uses the DesignWare core.
+# On ARM32, the driver uses hook_fault_code() and cannot be built as a loadable module.
config PCI_KEYSTONE
bool
+# On non-ARM32 platforms, the driver can be built as a loadable module.
+config PCI_KEYSTONE_TRISTATE
+ tristate
+
config PCI_KEYSTONE_HOST
- bool "TI Keystone PCIe controller (host mode)"
+ tristate "TI Keystone PCIe controller (host mode)"
depends on ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST
depends on PCI_MSI
select PCIE_DW_HOST
- select PCI_KEYSTONE
+ select PCI_KEYSTONE if ARM
+ select PCI_KEYSTONE_TRISTATE if !ARM
help
Enables support for the PCIe controller in the Keystone SoC to
work in host mode. The PCI controller on Keystone is based on
@@ -498,11 +527,12 @@ config PCI_KEYSTONE_HOST
DesignWare core functions to implement the driver.
config PCI_KEYSTONE_EP
- bool "TI Keystone PCIe controller (endpoint mode)"
+ tristate "TI Keystone PCIe controller (endpoint mode)"
depends on ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST
depends on PCI_ENDPOINT
select PCIE_DW_EP
- select PCI_KEYSTONE
+ select PCI_KEYSTONE if ARM
+ select PCI_KEYSTONE_TRISTATE if !ARM
help
Enables support for the PCIe controller in the Keystone SoC to
work in endpoint mode. The PCI controller on Keystone is based
diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile
index 7ae28f3b0fb3..67ba59c02038 100644
--- a/drivers/pci/controller/dwc/Makefile
+++ b/drivers/pci/controller/dwc/Makefile
@@ -10,8 +10,12 @@ obj-$(CONFIG_PCI_DRA7XX) += pci-dra7xx.o
obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o
obj-$(CONFIG_PCIE_FU740) += pcie-fu740.o
obj-$(CONFIG_PCI_IMX6) += pci-imx6.o
+obj-$(CONFIG_PCIE_NXP_S32G) += pcie-nxp-s32g.o
obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o
+# On ARM32, the driver uses hook_fault_code() and cannot be built as a loadable module.
obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o
+# On non-ARM32 platforms, the driver can be built as a loadable module.
+obj-$(CONFIG_PCI_KEYSTONE_TRISTATE) += pci-keystone.o
obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o
obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o
obj-$(CONFIG_PCIE_QCOM_COMMON) += pcie-qcom-common.o
@@ -31,6 +35,7 @@ obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o
obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o
obj-$(CONFIG_PCIE_VISCONTI_HOST) += pcie-visconti.o
obj-$(CONFIG_PCIE_RCAR_GEN4) += pcie-rcar-gen4.o
+obj-$(CONFIG_PCIE_SPACEMIT_K1) += pcie-spacemit-k1.o
obj-$(CONFIG_PCIE_STM32_HOST) += pcie-stm32.o
obj-$(CONFIG_PCIE_STM32_EP) += pcie-stm32-ep.o
diff --git a/drivers/pci/controller/dwc/pci-keystone.c b/drivers/pci/controller/dwc/pci-keystone.c
index eb00aa380722..f86d9111f863 100644
--- a/drivers/pci/controller/dwc/pci-keystone.c
+++ b/drivers/pci/controller/dwc/pci-keystone.c
@@ -17,6 +17,7 @@
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdomain.h>
#include <linux/mfd/syscon.h>
+#include <linux/module.h>
#include <linux/msi.h>
#include <linux/of.h>
#include <linux/of_irq.h>
@@ -777,29 +778,7 @@ err:
return ret;
}
-#ifdef CONFIG_ARM
-/*
- * When a PCI device does not exist during config cycles, keystone host
- * gets a bus error instead of returning 0xffffffff (PCI_ERROR_RESPONSE).
- * This handler always returns 0 for this kind of fault.
- */
-static int ks_pcie_fault(unsigned long addr, unsigned int fsr,
- struct pt_regs *regs)
-{
- unsigned long instr = *(unsigned long *) instruction_pointer(regs);
-
- if ((instr & 0x0e100090) == 0x00100090) {
- int reg = (instr >> 12) & 15;
-
- regs->uregs[reg] = -1;
- regs->ARM_pc += 4;
- }
-
- return 0;
-}
-#endif
-
-static int __init ks_pcie_init_id(struct keystone_pcie *ks_pcie)
+static int ks_pcie_init_id(struct keystone_pcie *ks_pcie)
{
int ret;
unsigned int id;
@@ -831,7 +810,7 @@ static int __init ks_pcie_init_id(struct keystone_pcie *ks_pcie)
return 0;
}
-static int __init ks_pcie_host_init(struct dw_pcie_rp *pp)
+static int ks_pcie_host_init(struct dw_pcie_rp *pp)
{
struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
struct keystone_pcie *ks_pcie = to_keystone_pcie(pci);
@@ -861,15 +840,6 @@ static int __init ks_pcie_host_init(struct dw_pcie_rp *pp)
if (ret < 0)
return ret;
-#ifdef CONFIG_ARM
- /*
- * PCIe access errors that result into OCP errors are caught by ARM as
- * "External aborts"
- */
- hook_fault_code(17, ks_pcie_fault, SIGBUS, 0,
- "Asynchronous external abort");
-#endif
-
return 0;
}
@@ -1134,6 +1104,7 @@ static const struct of_device_id ks_pcie_of_match[] = {
},
{ },
};
+MODULE_DEVICE_TABLE(of, ks_pcie_of_match);
static int ks_pcie_probe(struct platform_device *pdev)
{
@@ -1337,6 +1308,8 @@ static int ks_pcie_probe(struct platform_device *pdev)
break;
default:
dev_err(dev, "INVALID device type %d\n", mode);
+ ret = -EINVAL;
+ goto err_get_sync;
}
ks_pcie_enable_error_irq(ks_pcie);
@@ -1379,4 +1352,45 @@ static struct platform_driver ks_pcie_driver = {
.of_match_table = ks_pcie_of_match,
},
};
+
+#ifdef CONFIG_ARM
+/*
+ * When a PCI device does not exist during config cycles, keystone host
+ * gets a bus error instead of returning 0xffffffff (PCI_ERROR_RESPONSE).
+ * This handler always returns 0 for this kind of fault.
+ */
+static int ks_pcie_fault(unsigned long addr, unsigned int fsr,
+ struct pt_regs *regs)
+{
+ unsigned long instr = *(unsigned long *)instruction_pointer(regs);
+
+ if ((instr & 0x0e100090) == 0x00100090) {
+ int reg = (instr >> 12) & 15;
+
+ regs->uregs[reg] = -1;
+ regs->ARM_pc += 4;
+ }
+
+ return 0;
+}
+
+static int __init ks_pcie_init(void)
+{
+ /*
+ * PCIe access errors that result into OCP errors are caught by ARM as
+ * "External aborts"
+ */
+ if (of_find_matching_node(NULL, ks_pcie_of_match))
+ hook_fault_code(17, ks_pcie_fault, SIGBUS, 0,
+ "Asynchronous external abort");
+
+ return platform_driver_register(&ks_pcie_driver);
+}
+device_initcall(ks_pcie_init);
+#else
builtin_platform_driver(ks_pcie_driver);
+#endif
+
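/*
 * Note on the non-ARM branch above: builtin_platform_driver() stays
 * correct for the new tristate options because the device_initcall() it
 * expands to is aliased to module_init() by <linux/module.h> (included
 * above) whenever the driver is built as a module.
 */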
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PCIe controller driver for Texas Instruments Keystone SoCs");
+MODULE_AUTHOR("Murali Karicheri <m-karicheri2@ti.com>");
diff --git a/drivers/pci/controller/dwc/pci-meson.c b/drivers/pci/controller/dwc/pci-meson.c
index 787469d1b396..54b6a4196f17 100644
--- a/drivers/pci/controller/dwc/pci-meson.c
+++ b/drivers/pci/controller/dwc/pci-meson.c
@@ -108,10 +108,22 @@ static int meson_pcie_get_mems(struct platform_device *pdev,
struct meson_pcie *mp)
{
struct dw_pcie *pci = &mp->pci;
+ struct resource *res;
- pci->dbi_base = devm_platform_ioremap_resource_byname(pdev, "elbi");
- if (IS_ERR(pci->dbi_base))
- return PTR_ERR(pci->dbi_base);
+ /*
+ * For the broken DTs that supply 'dbi' as 'elbi', parse the 'elbi'
+ * region and assign it to both 'pci->elbi_base' and 'pci->dbi_base' so
+ * that the DWC core can skip parsing both regions.
+ */
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
+ if (res) {
+ pci->elbi_base = devm_pci_remap_cfg_resource(pci->dev, res);
+ if (IS_ERR(pci->elbi_base))
+ return PTR_ERR(pci->elbi_base);
+
+ pci->dbi_base = pci->elbi_base;
+ pci->dbi_phys_addr = res->start;
+ }
mp->cfg_base = devm_platform_ioremap_resource_byname(pdev, "cfg");
if (IS_ERR(mp->cfg_base))
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 7f2112c2fb21..19571ac2b961 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -797,6 +797,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
return 0;
}
+EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_msix_irq);
/**
* dw_pcie_ep_cleanup - Cleanup DWC EP resources after fundamental reset
diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
index e92513c5bda5..372207c33a85 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -233,6 +233,7 @@ int dw_pcie_allocate_domains(struct dw_pcie_rp *pp)
return 0;
}
+EXPORT_SYMBOL_GPL(dw_pcie_allocate_domains);
void dw_pcie_free_msi(struct dw_pcie_rp *pp)
{
@@ -856,10 +857,19 @@ static void __iomem *dw_pcie_ecam_conf_map_bus(struct pci_bus *bus, unsigned int
return pci->dbi_base + where;
}
+static int dw_pcie_op_assert_perst(struct pci_bus *bus, bool assert)
+{
+ struct dw_pcie_rp *pp = bus->sysdata;
+ struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+
+ return dw_pcie_assert_perst(pci, assert);
+}
+
static struct pci_ops dw_pcie_ops = {
.map_bus = dw_pcie_own_conf_map_bus,
.read = pci_generic_config_read,
.write = pci_generic_config_write,
+ .assert_perst = dw_pcie_op_assert_perst,
};
static struct pci_ops dw_pcie_ecam_ops = {
@@ -1080,6 +1090,8 @@ int dw_pcie_setup_rc(struct dw_pcie_rp *pp)
PCI_COMMAND_MASTER | PCI_COMMAND_SERR;
dw_pcie_writel_dbi(pci, PCI_COMMAND, val);
+ dw_pcie_hide_unsupported_l1ss(pci);
+
dw_pcie_config_presets(pp);
/*
* If the platform provides its own child bus config accesses, it means
diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c
index c644216995f6..75fc8b767fcc 100644
--- a/drivers/pci/controller/dwc/pcie-designware.c
+++ b/drivers/pci/controller/dwc/pcie-designware.c
@@ -168,11 +168,13 @@ int dw_pcie_get_resources(struct dw_pcie *pci)
}
/* ELBI is an optional resource */
- res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
- if (res) {
- pci->elbi_base = devm_ioremap_resource(pci->dev, res);
- if (IS_ERR(pci->elbi_base))
- return PTR_ERR(pci->elbi_base);
+ if (!pci->elbi_base) {
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
+ if (res) {
+ pci->elbi_base = devm_ioremap_resource(pci->dev, res);
+ if (IS_ERR(pci->elbi_base))
+ return PTR_ERR(pci->elbi_base);
+ }
}
/* LLDD is supposed to manually switch the clocks and resets state */
@@ -1081,6 +1083,30 @@ void dw_pcie_edma_remove(struct dw_pcie *pci)
dw_edma_remove(&pci->edma);
}
+void dw_pcie_hide_unsupported_l1ss(struct dw_pcie *pci)
+{
+ u16 l1ss;
+ u32 l1ss_cap;
+
+ if (pci->l1ss_support)
+ return;
+
+ l1ss = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
+ if (!l1ss)
+ return;
+
+ /*
+ * Unless the driver claims "l1ss_support", don't advertise L1 PM
+ * Substates because they require CLKREQ# and possibly other
+ * device-specific configuration.
+ */
+ l1ss_cap = dw_pcie_readl_dbi(pci, l1ss + PCI_L1SS_CAP);
+ l1ss_cap &= ~(PCI_L1SS_CAP_PCIPM_L1_1 | PCI_L1SS_CAP_ASPM_L1_1 |
+ PCI_L1SS_CAP_PCIPM_L1_2 | PCI_L1SS_CAP_ASPM_L1_2 |
+ PCI_L1SS_CAP_L1_PM_SS);
+ dw_pcie_writel_dbi(pci, l1ss + PCI_L1SS_CAP, l1ss_cap);
+}
+
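/*
 * Sketch (not from this series): a glue driver that has validated CLKREQ#
 * routing on its board opts in before dw_pcie_setup_rc() runs, typically
 * from its dw_pcie_host_ops ->init() callback; otherwise the function
 * above clears the L1SS capability bits. foo_pcie_host_init() is a
 * hypothetical name.
 *
 *	static int foo_pcie_host_init(struct dw_pcie_rp *pp)
 *	{
 *		struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
 *
 *		pci->l1ss_support = true;
 *
 *		return 0;
 *	}
 */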
void dw_pcie_setup(struct dw_pcie *pci)
{
u32 val;
diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
index e995f692a1ec..31685951a080 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h
@@ -97,7 +97,7 @@
#define PORT_LANE_SKEW_INSERT_MASK GENMASK(23, 0)
#define PCIE_PORT_DEBUG0 0x728
-#define PORT_LOGIC_LTSSM_STATE_MASK 0x1f
+#define PORT_LOGIC_LTSSM_STATE_MASK 0x3f
#define PORT_LOGIC_LTSSM_STATE_L0 0x11
#define PCIE_PORT_DEBUG1 0x72C
#define PCIE_PORT_DEBUG1_LINK_UP BIT(4)
@@ -121,6 +121,7 @@
#define GEN3_RELATED_OFF 0x890
#define GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL BIT(0)
+#define GEN3_RELATED_OFF_EQ_PHASE_2_3 BIT(9)
#define GEN3_RELATED_OFF_RXEQ_RGRDLESS_RXTS BIT(13)
#define GEN3_RELATED_OFF_GEN3_EQ_DISABLE BIT(16)
#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT 24
@@ -138,6 +139,13 @@
#define GEN3_EQ_FMDC_MAX_PRE_CURSOR_DELTA GENMASK(13, 10)
#define GEN3_EQ_FMDC_MAX_POST_CURSOR_DELTA GENMASK(17, 14)
+#define COHERENCY_CONTROL_1_OFF 0x8E0
+#define CFG_MEMTYPE_BOUNDARY_LOW_ADDR_MASK GENMASK(31, 2)
+#define CFG_MEMTYPE_VALUE BIT(0)
+
+#define COHERENCY_CONTROL_2_OFF 0x8E4
+#define COHERENCY_CONTROL_3_OFF 0x8E8
+
#define PCIE_PORT_MULTI_LANE_CTRL 0x8C0
#define PORT_MLTI_UPCFG_SUPPORT BIT(7)
@@ -485,6 +493,7 @@ struct dw_pcie_ops {
enum dw_pcie_ltssm (*get_ltssm)(struct dw_pcie *pcie);
int (*start_link)(struct dw_pcie *pcie);
void (*stop_link)(struct dw_pcie *pcie);
+ int (*assert_perst)(struct dw_pcie *pcie, bool assert);
};
struct debugfs_info {
@@ -516,6 +525,7 @@ struct dw_pcie {
int max_link_speed;
u8 n_fts[2];
struct dw_edma_chip edma;
+ bool l1ss_support; /* L1 PM Substates support */
struct clk_bulk_data app_clks[DW_PCIE_NUM_APP_CLKS];
struct clk_bulk_data core_clks[DW_PCIE_NUM_CORE_CLKS];
struct reset_control_bulk_data app_rsts[DW_PCIE_NUM_APP_RSTS];
@@ -573,6 +583,7 @@ int dw_pcie_prog_ep_inbound_atu(struct dw_pcie *pci, u8 func_no, int index,
int type, u64 parent_bus_addr,
u8 bar, size_t size);
void dw_pcie_disable_atu(struct dw_pcie *pci, u32 dir, int index);
+void dw_pcie_hide_unsupported_l1ss(struct dw_pcie *pci);
void dw_pcie_setup(struct dw_pcie *pci);
void dw_pcie_iatu_detect(struct dw_pcie *pci);
int dw_pcie_edma_detect(struct dw_pcie *pci);
@@ -787,6 +798,14 @@ static inline void dw_pcie_stop_link(struct dw_pcie *pci)
pci->ops->stop_link(pci);
}
+static inline int dw_pcie_assert_perst(struct dw_pcie *pci, bool assert)
+{
+ if (pci->ops && pci->ops->assert_perst)
+ return pci->ops->assert_perst(pci, assert);
+
+ return 0;
+}
+
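/*
 * Sketch (assumption, not part of this series): a GPIO-based
 * implementation of the new ->assert_perst() hook, for glue drivers that
 * drive PERST# from a GPIO rather than vendor glue registers. The foo_*
 * names and the perst_gpio field are hypothetical; PERST# is active low,
 * so the polarity is expected to be encoded in the GPIO descriptor.
 *
 *	static int foo_pcie_assert_perst(struct dw_pcie *pci, bool assert)
 *	{
 *		struct foo_pcie *foo = to_foo_pcie(pci);
 *
 *		gpiod_set_value_cansleep(foo->perst_gpio, assert);
 *
 *		return 0;
 *	}
 *
 *	static const struct dw_pcie_ops foo_pcie_ops = {
 *		.assert_perst = foo_pcie_assert_perst,
 *	};
 */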
static inline enum dw_pcie_ltssm dw_pcie_get_ltssm(struct dw_pcie *pci)
{
u32 val;
diff --git a/drivers/pci/controller/dwc/pcie-dw-rockchip.c b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
index 3e2752c7dd09..f8605fe61a41 100644
--- a/drivers/pci/controller/dwc/pcie-dw-rockchip.c
+++ b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
@@ -62,6 +62,12 @@
/* Interrupt Mask Register Related to Miscellaneous Operation */
#define PCIE_CLIENT_INTR_MASK_MISC 0x24
+/* Power Management Control Register */
+#define PCIE_CLIENT_POWER_CON 0x2c
+#define PCIE_CLKREQ_READY FIELD_PREP_WM16(BIT(0), 1)
+#define PCIE_CLKREQ_NOT_READY FIELD_PREP_WM16(BIT(0), 0)
+#define PCIE_CLKREQ_PULL_DOWN FIELD_PREP_WM16(GENMASK(13, 12), 1)
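/*
 * Note: FIELD_PREP_WM16() prepares a value for "write-masked" registers,
 * where the upper 16 bits act as a write-enable mask for the lower 16
 * bits (the usual Rockchip hiword-mask layout). This lets e.g.
 * PCIE_CLKREQ_NOT_READY clear BIT(0) with a single write, without a
 * read-modify-write and without disturbing neighbouring fields.
 */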
+
/* Hot Reset Control Register */
#define PCIE_CLIENT_HOT_RESET_CTRL 0x180
#define PCIE_LTSSM_APP_DLY2_EN BIT(1)
@@ -82,9 +88,9 @@ struct rockchip_pcie {
unsigned int clk_cnt;
struct reset_control *rst;
struct gpio_desc *rst_gpio;
- struct regulator *vpcie3v3;
struct irq_domain *irq_domain;
const struct rockchip_pcie_of_data *data;
+ bool supports_clkreq;
};
struct rockchip_pcie_of_data {
@@ -200,6 +206,35 @@ static bool rockchip_pcie_link_up(struct dw_pcie *pci)
return FIELD_GET(PCIE_LINKUP_MASK, val) == PCIE_LINKUP;
}
+/*
+ * See e.g. section '11.6.6.4 L1 Substate' in the RK3588 TRM V1.0 for the steps
+ * needed to support L1 substates. Currently, just enable L1 substates for RC
+ * mode if CLKREQ# is properly connected and supports-clkreq is present in DT.
+ * For EP mode, more work is needed to actually save power in L1
+ * substates, so keep L1 substates disabled until there is proper support.
+ */
+static void rockchip_pcie_configure_l1ss(struct dw_pcie *pci)
+{
+ struct rockchip_pcie *rockchip = to_rockchip_pcie(pci);
+
+ /* Enable L1 substates if CLKREQ# is properly connected */
+ if (rockchip->supports_clkreq) {
+ rockchip_pcie_writel_apb(rockchip, PCIE_CLKREQ_READY,
+ PCIE_CLIENT_POWER_CON);
+ pci->l1ss_support = true;
+ return;
+ }
+
+ /*
+ * Otherwise, assert CLKREQ# unconditionally. Since
+ * pci->l1ss_support is not set, the DWC core will prevent L1
+ * Substates support from being advertised.
+ */
+ rockchip_pcie_writel_apb(rockchip,
+ PCIE_CLKREQ_PULL_DOWN | PCIE_CLKREQ_NOT_READY,
+ PCIE_CLIENT_POWER_CON);
+}
+
static void rockchip_pcie_enable_l0s(struct dw_pcie *pci)
{
u32 cap, lnkcap;
@@ -264,6 +299,7 @@ static int rockchip_pcie_host_init(struct dw_pcie_rp *pp)
irq_set_chained_handler_and_data(irq, rockchip_pcie_intx_handler,
rockchip);
+ rockchip_pcie_configure_l1ss(pci);
rockchip_pcie_enable_l0s(pci);
return 0;
@@ -412,6 +448,9 @@ static int rockchip_pcie_resource_get(struct platform_device *pdev,
return dev_err_probe(&pdev->dev, PTR_ERR(rockchip->rst),
"failed to get reset lines\n");
+ rockchip->supports_clkreq = of_property_read_bool(pdev->dev.of_node,
+ "supports-clkreq");
+
return 0;
}
@@ -652,22 +691,15 @@ static int rockchip_pcie_probe(struct platform_device *pdev)
return ret;
/* DON'T MOVE ME: must be enable before PHY init */
- rockchip->vpcie3v3 = devm_regulator_get_optional(dev, "vpcie3v3");
- if (IS_ERR(rockchip->vpcie3v3)) {
- if (PTR_ERR(rockchip->vpcie3v3) != -ENODEV)
- return dev_err_probe(dev, PTR_ERR(rockchip->vpcie3v3),
- "failed to get vpcie3v3 regulator\n");
- rockchip->vpcie3v3 = NULL;
- } else {
- ret = regulator_enable(rockchip->vpcie3v3);
- if (ret)
- return dev_err_probe(dev, ret,
- "failed to enable vpcie3v3 regulator\n");
- }
+ ret = devm_regulator_get_enable_optional(dev, "vpcie3v3");
+ if (ret < 0 && ret != -ENODEV)
+ return dev_err_probe(dev, ret,
+ "failed to enable vpcie3v3 regulator\n");
ret = rockchip_pcie_phy_init(rockchip);
if (ret)
- goto disable_regulator;
+ return dev_err_probe(dev, ret,
+ "failed to initialize the phy\n");
ret = reset_control_deassert(rockchip->rst);
if (ret)
@@ -700,9 +732,6 @@ deinit_clk:
clk_bulk_disable_unprepare(rockchip->clk_cnt, rockchip->clks);
deinit_phy:
rockchip_pcie_phy_deinit(rockchip);
-disable_regulator:
- if (rockchip->vpcie3v3)
- regulator_disable(rockchip->vpcie3v3);
return ret;
}
diff --git a/drivers/pci/controller/dwc/pcie-nxp-s32g.c b/drivers/pci/controller/dwc/pcie-nxp-s32g.c
new file mode 100644
index 000000000000..47745749f75c
--- /dev/null
+++ b/drivers/pci/controller/dwc/pcie-nxp-s32g.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCIe host controller driver for NXP S32G SoCs
+ *
+ * Copyright 2019-2025 NXP
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/of_address.h>
+#include <linux/pci.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include "pcie-designware.h"
+
+/* PCIe controller Sub-System */
+
+/* PCIe controller 0 General Control 1 */
+#define PCIE_S32G_PE0_GEN_CTRL_1 0x50
+#define DEVICE_TYPE_MASK GENMASK(3, 0)
+#define SRIS_MODE BIT(8)
+
+/* PCIe controller 0 General Control 3 */
+#define PCIE_S32G_PE0_GEN_CTRL_3 0x58
+#define LTSSM_EN BIT(0)
+
+/* PCIe Controller 0 Interrupt Status */
+#define PCIE_S32G_PE0_INT_STS 0xE8
+#define HP_INT_STS BIT(6)
+
+/* Boundary between peripheral space and physical memory space */
+#define S32G_MEMORY_BOUNDARY_ADDR 0x80000000
+
+struct s32g_pcie_port {
+ struct list_head list;
+ struct phy *phy;
+};
+
+struct s32g_pcie {
+ struct dw_pcie pci;
+ void __iomem *ctrl_base;
+ struct list_head ports;
+};
+
+#define to_s32g_from_dw_pcie(x) \
+ container_of(x, struct s32g_pcie, pci)
+
+static void s32g_pcie_writel_ctrl(struct s32g_pcie *s32g_pp, u32 reg, u32 val)
+{
+ writel(val, s32g_pp->ctrl_base + reg);
+}
+
+static u32 s32g_pcie_readl_ctrl(struct s32g_pcie *s32g_pp, u32 reg)
+{
+ return readl(s32g_pp->ctrl_base + reg);
+}
+
+static void s32g_pcie_enable_ltssm(struct s32g_pcie *s32g_pp)
+{
+ u32 reg;
+
+ reg = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3);
+ reg |= LTSSM_EN;
+ s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3, reg);
+}
+
+static void s32g_pcie_disable_ltssm(struct s32g_pcie *s32g_pp)
+{
+ u32 reg;
+
+ reg = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3);
+ reg &= ~LTSSM_EN;
+ s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3, reg);
+}
+
+static int s32g_pcie_start_link(struct dw_pcie *pci)
+{
+ struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci);
+
+ s32g_pcie_enable_ltssm(s32g_pp);
+
+ return 0;
+}
+
+static void s32g_pcie_stop_link(struct dw_pcie *pci)
+{
+ struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci);
+
+ s32g_pcie_disable_ltssm(s32g_pp);
+}
+
+static struct dw_pcie_ops s32g_pcie_ops = {
+ .start_link = s32g_pcie_start_link,
+ .stop_link = s32g_pcie_stop_link,
+};
+
+/* Configure the AMBA AXI Coherency Extensions (ACE) interface */
+static void s32g_pcie_reset_mstr_ace(struct dw_pcie *pci)
+{
+ u32 ddr_base_low = lower_32_bits(S32G_MEMORY_BOUNDARY_ADDR);
+ u32 ddr_base_high = upper_32_bits(S32G_MEMORY_BOUNDARY_ADDR);
+
+ dw_pcie_dbi_ro_wr_en(pci);
+ dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_3_OFF, 0x0);
+
+ /*
+ * Ncore is a cache-coherent interconnect module that enables the
+ * integration of heterogeneous coherent and non-coherent agents in
+ * the chip. Ncore transactions to peripherals must be non-coherent,
+ * or Ncore might drop them.
+ *
+ * One example where this matters is PCIe MSIs, which use NoSnoop=0
+ * and might end up routed to Ncore. PCIe coherent traffic (e.g. MSIs)
+ * that targets peripheral space will be dropped by Ncore because
+ * peripherals on S32G are not coherent as slaves. We add a hard
+ * boundary in the PCIe controller coherency control registers to
+ * separate physical memory space from peripheral space.
+ *
+ * Define the start of DDR as seen by Linux as this boundary between
+ * "memory" and "peripherals", with peripherals being below.
+ */
+ dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_1_OFF,
+ (ddr_base_low & CFG_MEMTYPE_BOUNDARY_LOW_ADDR_MASK));
+ dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_2_OFF, ddr_base_high);
+ dw_pcie_dbi_ro_wr_dis(pci);
+}
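/*
 * Sketch (hypothetical value): the boundary is a 64-bit address split
 * across the two registers written above; COHERENCY_CONTROL_1_OFF
 * carries bits [31:2] of the low word and COHERENCY_CONTROL_2_OFF the
 * high word. Moving the boundary to 0xC0000000 would look like:
 *
 *	u64 boundary = 0xC0000000ULL;
 *
 *	dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_1_OFF,
 *			   lower_32_bits(boundary) &
 *			   CFG_MEMTYPE_BOUNDARY_LOW_ADDR_MASK);
 *	dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_2_OFF,
 *			   upper_32_bits(boundary));
 */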
+
+static int s32g_init_pcie_controller(struct dw_pcie_rp *pp)
+{
+ struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+ struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci);
+ u32 val;
+
+ /* Set RP mode */
+ val = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_1);
+ val &= ~DEVICE_TYPE_MASK;
+ val |= FIELD_PREP(DEVICE_TYPE_MASK, PCI_EXP_TYPE_ROOT_PORT);
+
+ /* Use default CRNS */
+ val &= ~SRIS_MODE;
+
+ s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_1, val);
+
+ /*
+ * Make sure we use the coherency defaults (just in case the settings
+ * have been changed from their reset values)
+ */
+ s32g_pcie_reset_mstr_ace(pci);
+
+ dw_pcie_dbi_ro_wr_en(pci);
+
+ val = dw_pcie_readl_dbi(pci, PCIE_PORT_FORCE);
+ val |= PORT_FORCE_DO_DESKEW_FOR_SRIS;
+ dw_pcie_writel_dbi(pci, PCIE_PORT_FORCE, val);
+
+ val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
+ val |= GEN3_RELATED_OFF_EQ_PHASE_2_3;
+ dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val);
+
+ dw_pcie_dbi_ro_wr_dis(pci);
+
+ return 0;
+}
+
+static const struct dw_pcie_host_ops s32g_pcie_host_ops = {
+ .init = s32g_init_pcie_controller,
+};
+
+static int s32g_init_pcie_phy(struct s32g_pcie *s32g_pp)
+{
+ struct dw_pcie *pci = &s32g_pp->pci;
+ struct device *dev = pci->dev;
+ struct s32g_pcie_port *port, *tmp;
+ int ret;
+
+ list_for_each_entry(port, &s32g_pp->ports, list) {
+ ret = phy_init(port->phy);
+ if (ret) {
+ dev_err(dev, "Failed to init serdes PHY\n");
+ goto err_phy_revert;
+ }
+
+ ret = phy_set_mode_ext(port->phy, PHY_MODE_PCIE, 0);
+ if (ret) {
+ dev_err(dev, "Failed to set mode on serdes PHY\n");
+ goto err_phy_exit;
+ }
+
+ ret = phy_power_on(port->phy);
+ if (ret) {
+ dev_err(dev, "Failed to power on serdes PHY\n");
+ goto err_phy_exit;
+ }
+ }
+
+ return 0;
+
+err_phy_exit:
+ phy_exit(port->phy);
+
+err_phy_revert:
+ list_for_each_entry_continue_reverse(port, &s32g_pp->ports, list) {
+ phy_power_off(port->phy);
+ phy_exit(port->phy);
+ }
+
+ list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list)
+ list_del(&port->list);
+
+ return ret;
+}
+
+static void s32g_deinit_pcie_phy(struct s32g_pcie *s32g_pp)
+{
+ struct s32g_pcie_port *port, *tmp;
+
+ list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list) {
+ phy_power_off(port->phy);
+ phy_exit(port->phy);
+ list_del(&port->list);
+ }
+}
+
+static int s32g_pcie_init(struct device *dev, struct s32g_pcie *s32g_pp)
+{
+ s32g_pcie_disable_ltssm(s32g_pp);
+
+ return s32g_init_pcie_phy(s32g_pp);
+}
+
+static void s32g_pcie_deinit(struct s32g_pcie *s32g_pp)
+{
+ s32g_pcie_disable_ltssm(s32g_pp);
+
+ s32g_deinit_pcie_phy(s32g_pp);
+}
+
+static int s32g_pcie_parse_port(struct s32g_pcie *s32g_pp, struct device_node *node)
+{
+ struct device *dev = s32g_pp->pci.dev;
+ struct s32g_pcie_port *port;
+ int num_lanes;
+
+ port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+
+ port->phy = devm_of_phy_get(dev, node, NULL);
+ if (IS_ERR(port->phy))
+ return dev_err_probe(dev, PTR_ERR(port->phy),
+ "Failed to get serdes PHY\n");
+
+ INIT_LIST_HEAD(&port->list);
+ list_add_tail(&port->list, &s32g_pp->ports);
+
+ /*
+ * The DWC core initialization code cannot yet parse the num-lanes
+ * attribute in the Root Port node. The S32G only supports one Root
+ * Port for now so its driver can parse the node and set the num_lanes
+ * field of struct dw_pcie before calling dw_pcie_host_init().
+ */
+ if (!of_property_read_u32(node, "num-lanes", &num_lanes))
+ s32g_pp->pci.num_lanes = num_lanes;
+
+ return 0;
+}
+
+static int s32g_pcie_parse_ports(struct device *dev, struct s32g_pcie *s32g_pp)
+{
+ struct s32g_pcie_port *port, *tmp;
+ int ret = -ENOENT;
+
+ for_each_available_child_of_node_scoped(dev->of_node, of_port) {
+ if (!of_node_is_type(of_port, "pci"))
+ continue;
+
+ ret = s32g_pcie_parse_port(s32g_pp, of_port);
+ if (ret)
+ goto err_port;
+ }
+
+ return ret;
+
+err_port:
+ list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list)
+ list_del(&port->list);
+
+ return ret;
+}
+
+static int s32g_pcie_get_resources(struct platform_device *pdev,
+ struct s32g_pcie *s32g_pp)
+{
+ struct dw_pcie *pci = &s32g_pp->pci;
+ struct device *dev = &pdev->dev;
+ int ret;
+
+ pci->dev = dev;
+ pci->ops = &s32g_pcie_ops;
+
+ s32g_pp->ctrl_base = devm_platform_ioremap_resource_byname(pdev, "ctrl");
+ if (IS_ERR(s32g_pp->ctrl_base))
+ return PTR_ERR(s32g_pp->ctrl_base);
+
+ INIT_LIST_HEAD(&s32g_pp->ports);
+
+ ret = s32g_pcie_parse_ports(dev, s32g_pp);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to parse Root Port\n");
+
+ platform_set_drvdata(pdev, s32g_pp);
+
+ return 0;
+}
+
+static int s32g_pcie_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct s32g_pcie *s32g_pp;
+ struct dw_pcie_rp *pp;
+ int ret;
+
+ s32g_pp = devm_kzalloc(dev, sizeof(*s32g_pp), GFP_KERNEL);
+ if (!s32g_pp)
+ return -ENOMEM;
+
+ ret = s32g_pcie_get_resources(pdev, s32g_pp);
+ if (ret)
+ return ret;
+
+ pm_runtime_no_callbacks(dev);
+ devm_pm_runtime_enable(dev);
+ ret = pm_runtime_get_sync(dev);
+ if (ret < 0)
+ goto err_pm_runtime_put;
+
+ ret = s32g_pcie_init(dev, s32g_pp);
+ if (ret)
+ goto err_pm_runtime_put;
+
+ pp = &s32g_pp->pci.pp;
+ pp->ops = &s32g_pcie_host_ops;
+ pp->use_atu_msg = true;
+
+ ret = dw_pcie_host_init(pp);
+ if (ret)
+ goto err_pcie_deinit;
+
+ return 0;
+
+err_pcie_deinit:
+ s32g_pcie_deinit(s32g_pp);
+err_pm_runtime_put:
+ pm_runtime_put(dev);
+
+ return ret;
+}
+
+static int s32g_pcie_suspend_noirq(struct device *dev)
+{
+ struct s32g_pcie *s32g_pp = dev_get_drvdata(dev);
+ struct dw_pcie *pci = &s32g_pp->pci;
+
+ return dw_pcie_suspend_noirq(pci);
+}
+
+static int s32g_pcie_resume_noirq(struct device *dev)
+{
+ struct s32g_pcie *s32g_pp = dev_get_drvdata(dev);
+ struct dw_pcie *pci = &s32g_pp->pci;
+
+ return dw_pcie_resume_noirq(pci);
+}
+
+static const struct dev_pm_ops s32g_pcie_pm_ops = {
+ NOIRQ_SYSTEM_SLEEP_PM_OPS(s32g_pcie_suspend_noirq,
+ s32g_pcie_resume_noirq)
+};
+
+static const struct of_device_id s32g_pcie_of_match[] = {
+ { .compatible = "nxp,s32g2-pcie" },
+ { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, s32g_pcie_of_match);
+
+static struct platform_driver s32g_pcie_driver = {
+ .driver = {
+ .name = "s32g-pcie",
+ .of_match_table = s32g_pcie_of_match,
+ .suppress_bind_attrs = true,
+ .pm = pm_sleep_ptr(&s32g_pcie_pm_ops),
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+ .probe = s32g_pcie_probe,
+};
+
+builtin_platform_driver(s32g_pcie_driver);
+
+MODULE_AUTHOR("Ionut Vicovan <Ionut.Vicovan@nxp.com>");
+MODULE_DESCRIPTION("NXP S32G PCIe Host controller driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index c48a20602d7f..7b92e7a1c0d9 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -641,6 +641,18 @@ static int qcom_pcie_post_init_1_0_0(struct qcom_pcie *pcie)
return 0;
}
+static int qcom_pcie_assert_perst(struct dw_pcie *pci, bool assert)
+{
+ struct qcom_pcie *pcie = to_qcom_pcie(pci);
+
+ if (assert)
+ qcom_ep_reset_assert(pcie);
+ else
+ qcom_ep_reset_deassert(pcie);
+
+ return 0;
+}
+
static void qcom_pcie_2_3_2_ltssm_enable(struct qcom_pcie *pcie)
{
u32 val;
@@ -1012,6 +1024,8 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie)
val &= ~REQ_NOT_ENTR_L1;
writel(val, pcie->parf + PARF_PM_CTRL);
+ pci->l1ss_support = true;
+
val = readl(pcie->parf + PARF_AXI_MSTR_WR_ADDR_HALT_V2);
val |= EN;
writel(val, pcie->parf + PARF_AXI_MSTR_WR_ADDR_HALT_V2);
@@ -1480,6 +1494,7 @@ static const struct qcom_pcie_cfg cfg_fw_managed = {
static const struct dw_pcie_ops dw_pcie_ops = {
.link_up = qcom_pcie_link_up,
.start_link = qcom_pcie_start_link,
+ .assert_perst = qcom_pcie_assert_perst,
};
static int qcom_pcie_icc_init(struct qcom_pcie *pcie)
@@ -1529,6 +1544,7 @@ static void qcom_pcie_icc_opp_update(struct qcom_pcie *pcie)
{
u32 offset, status, width, speed;
struct dw_pcie *pci = pcie->pci;
+ struct dev_pm_opp_key key = {};
unsigned long freq_kbps;
struct dev_pm_opp *opp;
int ret, freq_mbps;
@@ -1556,8 +1572,20 @@ static void qcom_pcie_icc_opp_update(struct qcom_pcie *pcie)
return;
freq_kbps = freq_mbps * KILO;
- opp = dev_pm_opp_find_freq_exact(pci->dev, freq_kbps * width,
- true);
+ opp = dev_pm_opp_find_level_exact(pci->dev, speed);
+ if (IS_ERR(opp)) {
+ /* opp-level is not defined, use only the frequency */
+ opp = dev_pm_opp_find_freq_exact(pci->dev, freq_kbps * width,
+ true);
+ } else {
+ /* drop the reference taken by the opp-level lookup */
+ dev_pm_opp_put(opp);
+
+ key.freq = freq_kbps * width;
+ key.level = speed;
+ key.bw = 0;
+ opp = dev_pm_opp_find_key_exact(pci->dev, &key, true);
+ }
if (!IS_ERR(opp)) {
ret = dev_pm_opp_set_opp(pci->dev, opp);
if (ret)
diff --git a/drivers/pci/controller/dwc/pcie-spacemit-k1.c b/drivers/pci/controller/dwc/pcie-spacemit-k1.c
new file mode 100644
index 000000000000..be20a520255b
--- /dev/null
+++ b/drivers/pci/controller/dwc/pcie-spacemit-k1.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SpacemiT K1 PCIe host driver
+ *
+ * Copyright (C) 2025 by RISCstar Solutions Corporation. All rights reserved.
+ * Copyright (c) 2023, spacemit Corporation.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mod_devicetable.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/reset.h>
+#include <linux/types.h>
+
+#include "pcie-designware.h"
+
+#define PCI_VENDOR_ID_SPACEMIT 0x201f
+#define PCI_DEVICE_ID_SPACEMIT_K1 0x0001
+
+/* Offsets and field definitions for link management registers */
+#define K1_PHY_AHB_IRQ_EN 0x0000
+#define PCIE_INTERRUPT_EN BIT(0)
+
+#define K1_PHY_AHB_LINK_STS 0x0004
+#define SMLH_LINK_UP BIT(1)
+#define RDLH_LINK_UP BIT(12)
+
+#define INTR_ENABLE 0x0014
+#define MSI_CTRL_INT BIT(11)
+
+/* Some controls require APMU regmap access */
+#define SYSCON_APMU "spacemit,apmu"
+
+/* Offsets and field definitions for APMU registers */
+#define PCIE_CLK_RESET_CONTROL 0x0000
+#define LTSSM_EN BIT(6)
+#define PCIE_AUX_PWR_DET BIT(9)
+#define PCIE_RC_PERST BIT(12) /* 1: assert PERST# */
+#define APP_HOLD_PHY_RST BIT(30)
+#define DEVICE_TYPE_RC BIT(31) /* 0: endpoint; 1: RC */
+
+#define PCIE_CONTROL_LOGIC 0x0004
+#define PCIE_SOFT_RESET BIT(0)
+
+struct k1_pcie {
+ struct dw_pcie pci;
+ struct phy *phy;
+ void __iomem *link;
+ struct regmap *pmu; /* Errors ignored; MMIO-backed regmap */
+ u32 pmu_off;
+};
+
+#define to_k1_pcie(dw_pcie) \
+ platform_get_drvdata(to_platform_device((dw_pcie)->dev))
+
+static void k1_pcie_toggle_soft_reset(struct k1_pcie *k1)
+{
+ u32 offset;
+ u32 val;
+
+ /*
+ * Write, then read back to guarantee the write has reached the device
+ * before we start the delay.
+ */
+ offset = k1->pmu_off + PCIE_CONTROL_LOGIC;
+ regmap_set_bits(k1->pmu, offset, PCIE_SOFT_RESET);
+ regmap_read(k1->pmu, offset, &val);
+
+ mdelay(2);
+
+ regmap_clear_bits(k1->pmu, offset, PCIE_SOFT_RESET);
+}
+
+/* Enable app clocks, deassert resets */
+static int k1_pcie_enable_resources(struct k1_pcie *k1)
+{
+ struct dw_pcie *pci = &k1->pci;
+ int ret;
+
+ ret = clk_bulk_prepare_enable(ARRAY_SIZE(pci->app_clks), pci->app_clks);
+ if (ret)
+ return ret;
+
+ ret = reset_control_bulk_deassert(ARRAY_SIZE(pci->app_rsts),
+ pci->app_rsts);
+ if (ret)
+ goto err_disable_clks;
+
+ return 0;
+
+err_disable_clks:
+ clk_bulk_disable_unprepare(ARRAY_SIZE(pci->app_clks), pci->app_clks);
+
+ return ret;
+}
+
+/* Assert resets, disable app clocks */
+static void k1_pcie_disable_resources(struct k1_pcie *k1)
+{
+ struct dw_pcie *pci = &k1->pci;
+
+ reset_control_bulk_assert(ARRAY_SIZE(pci->app_rsts), pci->app_rsts);
+ clk_bulk_disable_unprepare(ARRAY_SIZE(pci->app_clks), pci->app_clks);
+}
+
+/* FIXME: Disable ASPM L1 to avoid errors reported on some NVMe drives */
+static void k1_pcie_disable_aspm_l1(struct k1_pcie *k1)
+{
+ struct dw_pcie *pci = &k1->pci;
+ u8 offset;
+ u32 val;
+
+ offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
+ offset += PCI_EXP_LNKCAP;
+
+ dw_pcie_dbi_ro_wr_en(pci);
+ val = dw_pcie_readl_dbi(pci, offset);
+ val &= ~PCI_EXP_LNKCAP_ASPM_L1;
+ dw_pcie_writel_dbi(pci, offset, val);
+ dw_pcie_dbi_ro_wr_dis(pci);
+}
+
+static int k1_pcie_init(struct dw_pcie_rp *pp)
+{
+ struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+ u32 reset_ctrl;
+ u32 val;
+ int ret;
+
+ k1_pcie_toggle_soft_reset(k1);
+
+ ret = k1_pcie_enable_resources(k1);
+ if (ret)
+ return ret;
+
+ /* Set the PCI vendor and device ID */
+ dw_pcie_dbi_ro_wr_en(pci);
+ dw_pcie_writew_dbi(pci, PCI_VENDOR_ID, PCI_VENDOR_ID_SPACEMIT);
+ dw_pcie_writew_dbi(pci, PCI_DEVICE_ID, PCI_DEVICE_ID_SPACEMIT_K1);
+ dw_pcie_dbi_ro_wr_dis(pci);
+
+ /*
+ * Start by asserting fundamental reset (drive PERST# low). The
+ * PCI CEM spec says that PERST# should be deasserted at least
+ * 100ms after the power becomes stable, so we'll insert that
+ * delay first. Write, then read it back to guarantee the write
+ * reaches the device before we start the delay.
+ */
+ reset_ctrl = k1->pmu_off + PCIE_CLK_RESET_CONTROL;
+ regmap_set_bits(k1->pmu, reset_ctrl, PCIE_RC_PERST);
+ regmap_read(k1->pmu, reset_ctrl, &val);
+ mdelay(PCIE_T_PVPERL_MS);
+
+ /*
+ * Put the controller in root complex mode, and indicate that
+ * Vaux (3.3v) is present.
+ */
+ regmap_set_bits(k1->pmu, reset_ctrl, DEVICE_TYPE_RC | PCIE_AUX_PWR_DET);
+
+ ret = phy_init(k1->phy);
+ if (ret) {
+ k1_pcie_disable_resources(k1);
+
+ return ret;
+ }
+
+ /* Deassert fundamental reset (drive PERST# high) */
+ regmap_clear_bits(k1->pmu, reset_ctrl, PCIE_RC_PERST);
+
+ /* Finally, as a workaround, disable ASPM L1 */
+ k1_pcie_disable_aspm_l1(k1);
+
+ return 0;
+}
+
+static void k1_pcie_deinit(struct dw_pcie_rp *pp)
+{
+ struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+
+ /* Assert fundamental reset (drive PERST# low) */
+ regmap_set_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL,
+ PCIE_RC_PERST);
+
+ phy_exit(k1->phy);
+
+ k1_pcie_disable_resources(k1);
+}
+
+static const struct dw_pcie_host_ops k1_pcie_host_ops = {
+ .init = k1_pcie_init,
+ .deinit = k1_pcie_deinit,
+};
+
+static bool k1_pcie_link_up(struct dw_pcie *pci)
+{
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+ u32 val;
+
+ val = readl_relaxed(k1->link + K1_PHY_AHB_LINK_STS);
+
+ return (val & RDLH_LINK_UP) && (val & SMLH_LINK_UP);
+}
+
+static int k1_pcie_start_link(struct dw_pcie *pci)
+{
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+ u32 val;
+
+ /* Stop holding the PHY in reset, and enable link training */
+ regmap_update_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL,
+ APP_HOLD_PHY_RST | LTSSM_EN, LTSSM_EN);
+
+ /* Enable the MSI interrupt */
+ writel_relaxed(MSI_CTRL_INT, k1->link + INTR_ENABLE);
+
+ /* Top-level interrupt enable */
+ val = readl_relaxed(k1->link + K1_PHY_AHB_IRQ_EN);
+ val |= PCIE_INTERRUPT_EN;
+ writel_relaxed(val, k1->link + K1_PHY_AHB_IRQ_EN);
+
+ return 0;
+}
+
+static void k1_pcie_stop_link(struct dw_pcie *pci)
+{
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+ u32 val;
+
+ /* Disable interrupts */
+ val = readl_relaxed(k1->link + K1_PHY_AHB_IRQ_EN);
+ val &= ~PCIE_INTERRUPT_EN;
+ writel_relaxed(val, k1->link + K1_PHY_AHB_IRQ_EN);
+
+ writel_relaxed(0, k1->link + INTR_ENABLE);
+
+ /* Disable the link and hold the PHY in reset */
+ regmap_update_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL,
+ APP_HOLD_PHY_RST | LTSSM_EN, APP_HOLD_PHY_RST);
+}
+
+static const struct dw_pcie_ops k1_pcie_ops = {
+ .link_up = k1_pcie_link_up,
+ .start_link = k1_pcie_start_link,
+ .stop_link = k1_pcie_stop_link,
+};
+
+static int k1_pcie_parse_port(struct k1_pcie *k1)
+{
+ struct device *dev = k1->pci.dev;
+ struct device_node *root_port;
+ struct phy *phy;
+
+ /* We assume only one root port */
+ root_port = of_get_next_available_child(dev_of_node(dev), NULL);
+ if (!root_port)
+ return -EINVAL;
+
+ phy = devm_of_phy_get(dev, root_port, NULL);
+
+ of_node_put(root_port);
+
+ if (IS_ERR(phy))
+ return PTR_ERR(phy);
+
+ k1->phy = phy;
+
+ return 0;
+}
+
+static int k1_pcie_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct k1_pcie *k1;
+ int ret;
+
+ k1 = devm_kzalloc(dev, sizeof(*k1), GFP_KERNEL);
+ if (!k1)
+ return -ENOMEM;
+
+ k1->pmu = syscon_regmap_lookup_by_phandle_args(dev_of_node(dev),
+ SYSCON_APMU, 1,
+ &k1->pmu_off);
+ if (IS_ERR(k1->pmu))
+ return dev_err_probe(dev, PTR_ERR(k1->pmu),
+ "failed to lookup PMU registers\n");
+
+ k1->link = devm_platform_ioremap_resource_byname(pdev, "link");
+ if (IS_ERR(k1->link))
+ return dev_err_probe(dev, PTR_ERR(k1->link),
+ "failed to map \"link\" registers\n");
+
+ k1->pci.dev = dev;
+ k1->pci.ops = &k1_pcie_ops;
+ k1->pci.pp.num_vectors = MAX_MSI_IRQS;
+ dw_pcie_cap_set(&k1->pci, REQ_RES);
+
+ k1->pci.pp.ops = &k1_pcie_host_ops;
+
+ /* Hold the PHY in reset until we start the link */
+ regmap_set_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL,
+ APP_HOLD_PHY_RST);
+
+ ret = devm_regulator_get_enable(dev, "vpcie3v3");
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "failed to get \"vpcie3v3\" supply\n");
+
+ pm_runtime_set_active(dev);
+ pm_runtime_no_callbacks(dev);
+ devm_pm_runtime_enable(dev);
+
+ platform_set_drvdata(pdev, k1);
+
+ ret = k1_pcie_parse_port(k1);
+ if (ret)
+ return dev_err_probe(dev, ret, "failed to parse root port\n");
+
+ ret = dw_pcie_host_init(&k1->pci.pp);
+ if (ret)
+ return dev_err_probe(dev, ret, "failed to initialize host\n");
+
+ return 0;
+}
+
+static void k1_pcie_remove(struct platform_device *pdev)
+{
+ struct k1_pcie *k1 = platform_get_drvdata(pdev);
+
+ dw_pcie_host_deinit(&k1->pci.pp);
+}
+
+static const struct of_device_id k1_pcie_of_match_table[] = {
+ { .compatible = "spacemit,k1-pcie", },
+ { }
+};
+
+static struct platform_driver k1_pcie_driver = {
+ .probe = k1_pcie_probe,
+ .remove = k1_pcie_remove,
+ .driver = {
+ .name = "spacemit-k1-pcie",
+ .of_match_table = k1_pcie_of_match_table,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+};
+module_platform_driver(k1_pcie_driver);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SpacemiT K1 PCIe host driver");
diff --git a/drivers/pci/controller/dwc/pcie-stm32-ep.c b/drivers/pci/controller/dwc/pcie-stm32-ep.c
index 3400c7cd2d88..2cecf32d2b0f 100644
--- a/drivers/pci/controller/dwc/pcie-stm32-ep.c
+++ b/drivers/pci/controller/dwc/pcie-stm32-ep.c
@@ -7,9 +7,9 @@
*/
#include <linux/clk.h>
+#include <linux/gpio/consumer.h>
#include <linux/mfd/syscon.h>
#include <linux/of_platform.h>
-#include <linux/of_gpio.h>
#include <linux/phy/phy.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
@@ -37,36 +37,9 @@ static void stm32_pcie_ep_init(struct dw_pcie_ep *ep)
dw_pcie_ep_reset_bar(pci, bar);
}
-static int stm32_pcie_enable_link(struct dw_pcie *pci)
-{
- struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci);
-
- regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR,
- STM32MP25_PCIECR_LTSSM_EN,
- STM32MP25_PCIECR_LTSSM_EN);
-
- return dw_pcie_wait_for_link(pci);
-}
-
-static void stm32_pcie_disable_link(struct dw_pcie *pci)
-{
- struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci);
-
- regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR, STM32MP25_PCIECR_LTSSM_EN, 0);
-}
-
static int stm32_pcie_start_link(struct dw_pcie *pci)
{
struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci);
- int ret;
-
- dev_dbg(pci->dev, "Enable link\n");
-
- ret = stm32_pcie_enable_link(pci);
- if (ret) {
- dev_err(pci->dev, "PCIe cannot establish link: %d\n", ret);
- return ret;
- }
enable_irq(stm32_pcie->perst_irq);
@@ -77,11 +50,7 @@ static void stm32_pcie_stop_link(struct dw_pcie *pci)
{
struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci);
- dev_dbg(pci->dev, "Disable link\n");
-
disable_irq(stm32_pcie->perst_irq);
-
- stm32_pcie_disable_link(pci);
}
static int stm32_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
@@ -152,6 +121,9 @@ static void stm32_pcie_perst_assert(struct dw_pcie *pci)
dev_dbg(dev, "PERST asserted by host\n");
+ regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR,
+ STM32MP25_PCIECR_LTSSM_EN, 0);
+
pci_epc_deinit_notify(ep->epc);
stm32_pcie_disable_resources(stm32_pcie);
@@ -192,6 +164,11 @@ static void stm32_pcie_perst_deassert(struct dw_pcie *pci)
pci_epc_init_notify(ep->epc);
+ /* Enable link training */
+ regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR,
+ STM32MP25_PCIECR_LTSSM_EN,
+ STM32MP25_PCIECR_LTSSM_EN);
+
return;
err_disable_resources:
@@ -237,6 +214,8 @@ static int stm32_add_pcie_ep(struct stm32_pcie *stm32_pcie,
ep->ops = &stm32_pcie_ep_ops;
+ ep->page_size = stm32_pcie_epc_features.align;
+
ret = dw_pcie_ep_init(ep);
if (ret) {
dev_err(dev, "Failed to initialize ep: %d\n", ret);
diff --git a/drivers/pci/controller/dwc/pcie-stm32.c b/drivers/pci/controller/dwc/pcie-stm32.c
index 96a5fb893af4..a9e77478443b 100644
--- a/drivers/pci/controller/dwc/pcie-stm32.c
+++ b/drivers/pci/controller/dwc/pcie-stm32.c
@@ -7,18 +7,30 @@
*/
#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/gpio/consumer.h>
+#include <linux/irq.h>
#include <linux/mfd/syscon.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/of.h>
#include <linux/of_platform.h>
#include <linux/phy/phy.h>
#include <linux/pinctrl/consumer.h>
#include <linux/platform_device.h>
+#include <linux/pm.h>
#include <linux/pm_runtime.h>
#include <linux/pm_wakeirq.h>
#include <linux/regmap.h>
#include <linux/reset.h>
+#include <linux/stddef.h>
+
+#include "../../pci.h"
+
#include "pcie-designware.h"
#include "pcie-stm32.h"
-#include "../../pci.h"
struct stm32_pcie {
struct dw_pcie pci;
diff --git a/drivers/pci/controller/dwc/pcie-stm32.h b/drivers/pci/controller/dwc/pcie-stm32.h
index 09d39f04e469..419cf1ff669d 100644
--- a/drivers/pci/controller/dwc/pcie-stm32.h
+++ b/drivers/pci/controller/dwc/pcie-stm32.h
@@ -6,6 +6,9 @@
* Author: Christian Bruel <christian.bruel@foss.st.com>
*/
+#include <linux/bits.h>
+#include <linux/device.h>
+
#define to_stm32_pcie(x) dev_get_drvdata((x)->dev)
#define STM32MP25_PCIECR_TYPE_MASK GENMASK(11, 8)
diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c
index 10e74458e667..0ddeef70726d 100644
--- a/drivers/pci/controller/dwc/pcie-tegra194.c
+++ b/drivers/pci/controller/dwc/pcie-tegra194.c
@@ -260,7 +260,6 @@ struct tegra_pcie_dw {
u32 msi_ctrl_int;
u32 num_lanes;
u32 cid;
- u32 cfg_link_cap_l1sub;
u32 ras_des_cap;
u32 pcie_cap_base;
u32 aspm_cmrt;
@@ -475,8 +474,7 @@ static irqreturn_t tegra_pcie_ep_irq_thread(int irq, void *arg)
return IRQ_HANDLED;
/* If EP doesn't advertise L1SS, just return */
- val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub);
- if (!(val & (PCI_L1SS_CAP_ASPM_L1_1 | PCI_L1SS_CAP_ASPM_L1_2)))
+ if (!pci->l1ss_support)
return IRQ_HANDLED;
/* Check if BME is set to '1' */
@@ -608,24 +606,6 @@ static struct pci_ops tegra_pci_ops = {
};
#if defined(CONFIG_PCIEASPM)
-static void disable_aspm_l11(struct tegra_pcie_dw *pcie)
-{
- u32 val;
-
- val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub);
- val &= ~PCI_L1SS_CAP_ASPM_L1_1;
- dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val);
-}
-
-static void disable_aspm_l12(struct tegra_pcie_dw *pcie)
-{
- u32 val;
-
- val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub);
- val &= ~PCI_L1SS_CAP_ASPM_L1_2;
- dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val);
-}
-
static inline u32 event_counter_prog(struct tegra_pcie_dw *pcie, u32 event)
{
u32 val;
@@ -682,10 +662,9 @@ static int aspm_state_cnt(struct seq_file *s, void *data)
static void init_host_aspm(struct tegra_pcie_dw *pcie)
{
struct dw_pcie *pci = &pcie->pci;
- u32 val;
+ u32 l1ss, val;
- val = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
- pcie->cfg_link_cap_l1sub = val + PCI_L1SS_CAP;
+ l1ss = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
pcie->ras_des_cap = dw_pcie_find_ext_capability(&pcie->pci,
PCI_EXT_CAP_ID_VNDR);
@@ -697,11 +676,14 @@ static void init_host_aspm(struct tegra_pcie_dw *pcie)
PCIE_RAS_DES_EVENT_COUNTER_CONTROL, val);
/* Program T_cmrt and T_pwr_on values */
- val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub);
+ val = dw_pcie_readl_dbi(pci, l1ss + PCI_L1SS_CAP);
val &= ~(PCI_L1SS_CAP_CM_RESTORE_TIME | PCI_L1SS_CAP_P_PWR_ON_VALUE);
val |= (pcie->aspm_cmrt << 8);
val |= (pcie->aspm_pwr_on_t << 19);
- dw_pcie_writel_dbi(pci, pcie->cfg_link_cap_l1sub, val);
+ dw_pcie_writel_dbi(pci, l1ss + PCI_L1SS_CAP, val);
+
+ if (pcie->supports_clkreq)
+ pci->l1ss_support = true;
/* Program L0s and L1 entrance latencies */
val = dw_pcie_readl_dbi(pci, PCIE_PORT_AFR);
@@ -726,8 +708,6 @@ static void init_debugfs(struct tegra_pcie_dw *pcie)
aspm_state_cnt);
}
#else
-static inline void disable_aspm_l12(struct tegra_pcie_dw *pcie) { return; }
-static inline void disable_aspm_l11(struct tegra_pcie_dw *pcie) { return; }
static inline void init_host_aspm(struct tegra_pcie_dw *pcie) { return; }
static inline void init_debugfs(struct tegra_pcie_dw *pcie) { return; }
#endif
@@ -931,12 +911,6 @@ static int tegra_pcie_dw_host_init(struct dw_pcie_rp *pp)
init_host_aspm(pcie);
- /* Disable ASPM-L1SS advertisement if there is no CLKREQ routing */
- if (!pcie->supports_clkreq) {
- disable_aspm_l11(pcie);
- disable_aspm_l12(pcie);
- }
-
if (!pcie->of_data->has_l1ss_exit_fix) {
val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL;
@@ -1871,12 +1845,6 @@ static void pex_ep_event_pex_rst_deassert(struct tegra_pcie_dw *pcie)
init_host_aspm(pcie);
- /* Disable ASPM-L1SS advertisement if there is no CLKREQ routing */
- if (!pcie->supports_clkreq) {
- disable_aspm_l11(pcie);
- disable_aspm_l12(pcie);
- }
-
if (!pcie->of_data->has_l1ss_exit_fix) {
val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL;
diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c
index 810d1c8de24e..c473e7c03bac 100644
--- a/drivers/pci/controller/pci-host-common.c
+++ b/drivers/pci/controller/pci-host-common.c
@@ -53,16 +53,12 @@ struct pci_config_window *pci_host_common_ecam_create(struct device *dev,
EXPORT_SYMBOL_GPL(pci_host_common_ecam_create);
int pci_host_common_init(struct platform_device *pdev,
+ struct pci_host_bridge *bridge,
const struct pci_ecam_ops *ops)
{
struct device *dev = &pdev->dev;
- struct pci_host_bridge *bridge;
struct pci_config_window *cfg;
- bridge = devm_pci_alloc_host_bridge(dev, 0);
- if (!bridge)
- return -ENOMEM;
-
of_pci_check_probe_only();
platform_set_drvdata(pdev, bridge);
@@ -85,12 +81,17 @@ EXPORT_SYMBOL_GPL(pci_host_common_init);
int pci_host_common_probe(struct platform_device *pdev)
{
const struct pci_ecam_ops *ops;
+ struct pci_host_bridge *bridge;
ops = of_device_get_match_data(&pdev->dev);
if (!ops)
return -ENODEV;
- return pci_host_common_init(pdev, ops);
+ bridge = devm_pci_alloc_host_bridge(&pdev->dev, 0);
+ if (!bridge)
+ return -ENOMEM;
+
+ return pci_host_common_init(pdev, bridge, ops);
}
EXPORT_SYMBOL_GPL(pci_host_common_probe);
diff --git a/drivers/pci/controller/pci-host-common.h b/drivers/pci/controller/pci-host-common.h
index 51c35ec0cf37..b5075d4bd7eb 100644
--- a/drivers/pci/controller/pci-host-common.h
+++ b/drivers/pci/controller/pci-host-common.h
@@ -14,6 +14,7 @@ struct pci_ecam_ops;
int pci_host_common_probe(struct platform_device *pdev);
int pci_host_common_init(struct platform_device *pdev,
+ struct pci_host_bridge *bridge,
const struct pci_ecam_ops *ops);
void pci_host_common_remove(struct platform_device *pdev);
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 146b43981b27..1e237d3538f9 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -3696,48 +3696,6 @@ static int hv_send_resources_released(struct hv_device *hdev)
return 0;
}
-#define HVPCI_DOM_MAP_SIZE (64 * 1024)
-static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
-
-/*
- * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
- * as invalid for passthrough PCI devices of this driver.
- */
-#define HVPCI_DOM_INVALID 0
-
-/**
- * hv_get_dom_num() - Get a valid PCI domain number
- * Check if the PCI domain number is in use, and return another number if
- * it is in use.
- *
- * @dom: Requested domain number
- *
- * return: domain number on success, HVPCI_DOM_INVALID on failure
- */
-static u16 hv_get_dom_num(u16 dom)
-{
- unsigned int i;
-
- if (test_and_set_bit(dom, hvpci_dom_map) == 0)
- return dom;
-
- for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
- if (test_and_set_bit(i, hvpci_dom_map) == 0)
- return i;
- }
-
- return HVPCI_DOM_INVALID;
-}
-
-/**
- * hv_put_dom_num() - Mark the PCI domain number as free
- * @dom: Domain number to be freed
- */
-static void hv_put_dom_num(u16 dom)
-{
- clear_bit(dom, hvpci_dom_map);
-}
-
/**
* hv_pci_probe() - New VMBus channel probe, for a root PCI bus
* @hdev: VMBus's tracking struct for this root PCI bus
@@ -3750,9 +3708,9 @@ static int hv_pci_probe(struct hv_device *hdev,
{
struct pci_host_bridge *bridge;
struct hv_pcibus_device *hbus;
- u16 dom_req, dom;
+ int ret, dom;
+ u16 dom_req;
char *name;
- int ret;
bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
if (!bridge)
@@ -3779,11 +3737,14 @@ static int hv_pci_probe(struct hv_device *hdev,
* PCI bus (which is actually emulated by the hypervisor) is domain 0.
* (2) There will be no overlap between domains (after fixing possible
* collisions) in the same VM.
+ *
+ * Because Gen1 VMs use domain 0, don't allow picking domain 0 here,
+ * even if bytes 4 and 5 of the instance GUID are both zero. For wider
+ * userspace compatibility, limit the domain ID to a 16-bit value.
*/
dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
- dom = hv_get_dom_num(dom_req);
-
- if (dom == HVPCI_DOM_INVALID) {
+ dom = pci_bus_find_emul_domain_nr(dom_req, 1, U16_MAX);
+ if (dom < 0) {
dev_err(&hdev->device,
"Unable to use dom# 0x%x or other numbers", dom_req);
ret = -EINVAL;
@@ -3917,7 +3878,7 @@ close:
destroy_wq:
destroy_workqueue(hbus->wq);
free_dom:
- hv_put_dom_num(hbus->bridge->domain_nr);
+ pci_bus_release_emul_domain_nr(hbus->bridge->domain_nr);
free_bus:
kfree(hbus);
return ret;
@@ -4042,8 +4003,6 @@ static void hv_pci_remove(struct hv_device *hdev)
irq_domain_remove(hbus->irq_domain);
irq_domain_free_fwnode(hbus->fwnode);
- hv_put_dom_num(hbus->bridge->domain_nr);
-
kfree(hbus);
}
@@ -4217,9 +4176,6 @@ static int __init init_hv_pci_drv(void)
if (ret)
return ret;
- /* Set the invalid domain number's bit, so it will not be used */
- set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
-
/* Initialize PCI block r/w interface */
hvpci_block_ops.read_block = hv_read_config_block;
hvpci_block_ops.write_block = hv_write_config_block;
diff --git a/drivers/pci/controller/pci-ixp4xx.c b/drivers/pci/controller/pci-ixp4xx.c
index acb85e0d5675..9fd401838bad 100644
--- a/drivers/pci/controller/pci-ixp4xx.c
+++ b/drivers/pci/controller/pci-ixp4xx.c
@@ -214,6 +214,7 @@ static u32 ixp4xx_crp_byte_lane_enable_bits(u32 n, int size)
return 0xffffffff;
}
+#ifdef CONFIG_ARM
static int ixp4xx_crp_read_config(struct ixp4xx_pci *p, int where, int size,
u32 *value)
{
@@ -251,6 +252,7 @@ static int ixp4xx_crp_read_config(struct ixp4xx_pci *p, int where, int size,
return PCIBIOS_SUCCESSFUL;
}
+#endif
static int ixp4xx_crp_write_config(struct ixp4xx_pci *p, int where, int size,
u32 value)
@@ -470,6 +472,7 @@ static int ixp4xx_pci_parse_map_dma_ranges(struct ixp4xx_pci *p)
return 0;
}
+#ifdef CONFIG_ARM
/* Only used to get context for abort handling */
static struct ixp4xx_pci *ixp4xx_pci_abort_singleton;
@@ -509,6 +512,7 @@ static int ixp4xx_pci_abort_handler(unsigned long addr, unsigned int fsr,
return 0;
}
+#endif
static int __init ixp4xx_pci_probe(struct platform_device *pdev)
{
@@ -555,10 +559,12 @@ static int __init ixp4xx_pci_probe(struct platform_device *pdev)
dev_info(dev, "controller is in %s mode\n",
p->host_mode ? "host" : "option");
+#ifdef CONFIG_ARM
/* Hook in our fault handler for PCI errors */
ixp4xx_pci_abort_singleton = p;
hook_fault_code(16+6, ixp4xx_pci_abort_handler, SIGBUS, 0,
"imprecise external abort");
+#endif
ret = ixp4xx_pci_parse_map_ranges(p);
if (ret)
diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c
index 0380d300adca..2d92fc79f6dd 100644
--- a/drivers/pci/controller/pcie-apple.c
+++ b/drivers/pci/controller/pcie-apple.c
@@ -187,7 +187,6 @@ struct apple_pcie {
const struct hw_info *hw;
unsigned long *bitmap;
struct list_head ports;
- struct list_head entry;
struct completion event;
struct irq_fwspec fwspec;
u32 nvecs;
@@ -206,9 +205,6 @@ struct apple_pcie_port {
int idx;
};
-static LIST_HEAD(pcie_list);
-static DEFINE_MUTEX(pcie_list_lock);
-
static void rmw_set(u32 set, void __iomem *addr)
{
writel_relaxed(readl_relaxed(addr) | set, addr);
@@ -724,32 +720,9 @@ static int apple_msi_init(struct apple_pcie *pcie)
return 0;
}
-static void apple_pcie_register(struct apple_pcie *pcie)
-{
- guard(mutex)(&pcie_list_lock);
-
- list_add_tail(&pcie->entry, &pcie_list);
-}
-
-static void apple_pcie_unregister(struct apple_pcie *pcie)
-{
- guard(mutex)(&pcie_list_lock);
-
- list_del(&pcie->entry);
-}
-
static struct apple_pcie *apple_pcie_lookup(struct device *dev)
{
- struct apple_pcie *pcie;
-
- guard(mutex)(&pcie_list_lock);
-
- list_for_each_entry(pcie, &pcie_list, entry) {
- if (pcie->dev == dev)
- return pcie;
- }
-
- return NULL;
+ return pci_host_bridge_priv(dev_get_drvdata(dev));
}
static struct apple_pcie_port *apple_pcie_get_port(struct pci_dev *pdev)
@@ -875,13 +848,15 @@ static const struct pci_ecam_ops apple_pcie_cfg_ecam_ops = {
static int apple_pcie_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
+ struct pci_host_bridge *bridge;
struct apple_pcie *pcie;
int ret;
- pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
- if (!pcie)
+ bridge = devm_pci_alloc_host_bridge(dev, sizeof(*pcie));
+ if (!bridge)
return -ENOMEM;
+ pcie = pci_host_bridge_priv(bridge);
pcie->dev = dev;
pcie->hw = of_device_get_match_data(dev);
if (!pcie->hw)
@@ -897,13 +872,7 @@ static int apple_pcie_probe(struct platform_device *pdev)
if (ret)
return ret;
- apple_pcie_register(pcie);
-
- ret = pci_host_common_init(pdev, &apple_pcie_cfg_ecam_ops);
- if (ret)
- apple_pcie_unregister(pcie);
-
- return ret;
+ return pci_host_common_init(pdev, bridge, &apple_pcie_cfg_ecam_ops);
}
static const struct of_device_id apple_pcie_of_match[] = {
diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c
index 9afbd02ded35..062f55690012 100644
--- a/drivers/pci/controller/pcie-brcmstb.c
+++ b/drivers/pci/controller/pcie-brcmstb.c
@@ -14,15 +14,18 @@
#include <linux/irqchip/chained_irq.h>
#include <linux/irqchip/irq-msi-lib.h>
#include <linux/irqdomain.h>
+#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/msi.h>
+#include <linux/notifier.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/of_pci.h>
#include <linux/of_platform.h>
+#include <linux/panic_notifier.h>
#include <linux/pci.h>
#include <linux/pci-ecam.h>
#include <linux/printk.h>
@@ -30,7 +33,9 @@
#include <linux/reset.h>
#include <linux/sizes.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include "../pci.h"
@@ -48,7 +53,6 @@
#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY 0x04dc
#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_MAX_LINK_WIDTH_MASK 0x1f0
-#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK 0xc00
#define PCIE_RC_CFG_PRIV1_ROOT_CAP 0x4f8
#define PCIE_RC_CFG_PRIV1_ROOT_CAP_L1SS_MODE_MASK 0xf8
@@ -155,8 +159,40 @@
#define MSI_INT_MASK_SET 0x10
#define MSI_INT_MASK_CLR 0x14
+/* Error report registers */
+#define PCIE_OUTB_ERR_TREAT 0x6000
+#define PCIE_OUTB_ERR_TREAT_CONFIG 0x1
+#define PCIE_OUTB_ERR_TREAT_MEM 0x2
+#define PCIE_OUTB_ERR_VALID 0x6004
+#define PCIE_OUTB_ERR_CLEAR 0x6008
+#define PCIE_OUTB_ERR_ACC_INFO 0x600c
+#define PCIE_OUTB_ERR_ACC_INFO_CFG_ERR BIT(0)
+#define PCIE_OUTB_ERR_ACC_INFO_MEM_ERR BIT(1)
+#define PCIE_OUTB_ERR_ACC_INFO_TYPE_64 BIT(2)
+#define PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE BIT(4)
+#define PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES 0xff00
+#define PCIE_OUTB_ERR_ACC_ADDR 0x6010
+#define PCIE_OUTB_ERR_ACC_ADDR_BUS 0xff00000
+#define PCIE_OUTB_ERR_ACC_ADDR_DEV 0xf8000
+#define PCIE_OUTB_ERR_ACC_ADDR_FUNC 0x7000
+#define PCIE_OUTB_ERR_ACC_ADDR_REG 0xfff
+#define PCIE_OUTB_ERR_CFG_CAUSE 0x6014
+#define PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT BIT(6)
+#define PCIE_OUTB_ERR_CFG_CAUSE_ABORT BIT(5)
+#define PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ BIT(4)
+#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT BIT(2)
+#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED BIT(1)
+#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT BIT(0)
+#define PCIE_OUTB_ERR_MEM_ADDR_LO 0x6018
+#define PCIE_OUTB_ERR_MEM_ADDR_HI 0x601c
+#define PCIE_OUTB_ERR_MEM_CAUSE 0x6020
+#define PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT BIT(6)
+#define PCIE_OUTB_ERR_MEM_CAUSE_ABORT BIT(5)
+#define PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ BIT(4)
+#define PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED BIT(1)
+#define PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR BIT(0)
+
#define PCIE_RGR1_SW_INIT_1_PERST_MASK 0x1
-#define PCIE_RGR1_SW_INIT_1_PERST_SHIFT 0x0
#define RGR1_SW_INIT_1_INIT_GENERIC_MASK 0x2
#define RGR1_SW_INIT_1_INIT_GENERIC_SHIFT 0x1
@@ -259,6 +295,7 @@ struct pcie_cfg_data {
int (*perst_set)(struct brcm_pcie *pcie, u32 val);
int (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val);
int (*post_setup)(struct brcm_pcie *pcie);
+ bool has_err_report;
};
struct subdev_regulators {
@@ -303,6 +340,10 @@ struct brcm_pcie {
struct subdev_regulators *sr;
bool ep_wakeup_capable;
const struct pcie_cfg_data *cfg;
+ bool bridge_in_reset;
+ struct notifier_block die_notifier;
+ struct notifier_block panic_notifier;
+ spinlock_t bridge_lock;
};
static inline bool is_bmips(const struct brcm_pcie *pcie)
@@ -310,6 +351,24 @@ static inline bool is_bmips(const struct brcm_pcie *pcie)
return pcie->cfg->soc_base == BCM7435 || pcie->cfg->soc_base == BCM7425;
}
+static int brcm_pcie_bridge_sw_init_set(struct brcm_pcie *pcie, u32 val)
+{
+ unsigned long flags;
+ int ret;
+
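+	/*
+	 * bridge_lock serializes bridge_in_reset against the die/panic
+	 * dump path, which must not touch registers while the bridge is
+	 * held in reset.
+	 */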
+ if (pcie->cfg->has_err_report)
+ spin_lock_irqsave(&pcie->bridge_lock, flags);
+
+ ret = pcie->cfg->bridge_sw_init_set(pcie, val);
+ /* If we fail, assume the bridge is in reset (off) */
+ pcie->bridge_in_reset = ret ? true : val;
+
+ if (pcie->cfg->has_err_report)
+ spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+
+ return ret;
+}
+
/*
* This is to convert the size of the inbound "BAR" region to the
* non-linear values of PCIE_X_MISC_RC_BAR[123]_CONFIG_LO.SIZE
@@ -1075,13 +1134,13 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
void __iomem *base = pcie->base;
struct pci_host_bridge *bridge;
struct resource_entry *entry;
- u32 tmp, burst, aspm_support, num_lanes, num_lanes_cap;
+ u32 tmp, burst, num_lanes, num_lanes_cap;
u8 num_out_wins = 0;
int num_inbound_wins = 0;
int memc, ret;
/* Reset the bridge */
- ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
if (ret)
return ret;
@@ -1097,7 +1156,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
usleep_range(100, 200);
/* Take the bridge out of reset */
- ret = pcie->cfg->bridge_sw_init_set(pcie, 0);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
if (ret)
return ret;
@@ -1175,12 +1234,9 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
/* Don't advertise L0s capability if 'aspm-no-l0s' */
- aspm_support = PCIE_LINK_STATE_L1;
- if (!of_property_read_bool(pcie->np, "aspm-no-l0s"))
- aspm_support |= PCIE_LINK_STATE_L0S;
tmp = readl(base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY);
- u32p_replace_bits(&tmp, aspm_support,
- PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK);
+ if (of_property_read_bool(pcie->np, "aspm-no-l0s"))
+ tmp &= ~PCI_EXP_LNKCAP_ASPM_L0S;
writel(tmp, base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY);
/* 'tmp' still holds the contents of PRIV1_LINK_CAPABILITY */
@@ -1565,7 +1621,7 @@ static int brcm_pcie_turn_off(struct brcm_pcie *pcie)
if (!(pcie->cfg->quirks & CFG_QUIRK_AVOID_BRIDGE_SHUTDOWN))
/* Shutdown PCIe bridge */
- ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
return ret;
}
@@ -1653,7 +1709,9 @@ static int brcm_pcie_resume_noirq(struct device *dev)
goto err_reset;
/* Take bridge out of reset so we can access the SERDES reg */
- pcie->cfg->bridge_sw_init_set(pcie, 0);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
+ if (ret)
+ goto err_reset;
/* SERDES_IDDQ = 0 */
tmp = readl(base + HARD_DEBUG(pcie));
@@ -1707,6 +1765,119 @@ err_disable_clk:
return ret;
}
+/* Dump out PCIe errors on die or panic */
+static int brcm_pcie_dump_err(struct brcm_pcie *pcie,
+ const char *type)
+{
+ void __iomem *base = pcie->base;
+ int i, is_cfg_err, is_mem_err, lanes;
+ const char *width_str, *direction_str;
+ u32 info, cfg_addr, cfg_cause, mem_cause, lo, hi;
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie);
+ unsigned long flags;
+ char lanes_str[9];
+
+ spin_lock_irqsave(&pcie->bridge_lock, flags);
+ /* Don't access registers when the bridge is off */
+ if (pcie->bridge_in_reset || readl(base + PCIE_OUTB_ERR_VALID) == 0) {
+ spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+ return NOTIFY_DONE;
+ }
+
+ /* Read all necessary registers so we can release the spinlock ASAP */
+ info = readl(base + PCIE_OUTB_ERR_ACC_INFO);
+ is_cfg_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_CFG_ERR);
+ is_mem_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_MEM_ERR);
+ if (is_cfg_err) {
+ cfg_addr = readl(base + PCIE_OUTB_ERR_ACC_ADDR);
+ cfg_cause = readl(base + PCIE_OUTB_ERR_CFG_CAUSE);
+ }
+ if (is_mem_err) {
+ mem_cause = readl(base + PCIE_OUTB_ERR_MEM_CAUSE);
+ lo = readl(base + PCIE_OUTB_ERR_MEM_ADDR_LO);
+ hi = readl(base + PCIE_OUTB_ERR_MEM_ADDR_HI);
+ }
+ /* We've got all of the info, clear the error */
+ writel(1, base + PCIE_OUTB_ERR_CLEAR);
+ spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+
+	dev_err(pcie->dev, "reporting PCIe info which may be related to the %s error\n",
+ type);
+ width_str = (info & PCIE_OUTB_ERR_ACC_INFO_TYPE_64) ? "64bit" : "32bit";
+ direction_str = str_read_write(!(info & PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE));
+ lanes = FIELD_GET(PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES, info);
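+	/* Render the byte-lane bitmap as an 8-character '0'/'1' string */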
+ for (i = 0, lanes_str[8] = 0; i < 8; i++)
+ lanes_str[i] = (lanes & (1 << i)) ? '1' : '0';
+
+ if (is_cfg_err) {
+ int bus = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_BUS, cfg_addr);
+ int dev = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_DEV, cfg_addr);
+ int func = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_FUNC, cfg_addr);
+ int reg = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_REG, cfg_addr);
+
+ dev_err(pcie->dev, "Error: CFG Acc, %s, %s (%04x:%02x:%02x.%d) reg=0x%x, lanes=%s\n",
+ width_str, direction_str, bridge->domain_nr, bus, dev,
+ func, reg, lanes_str);
+ dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccTO=%d AccDsbld=%d Acc64bit=%d\n",
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ABORT),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT));
+ }
+
+ if (is_mem_err) {
+ u64 addr = ((u64)hi << 32) | (u64)lo;
+
+ dev_err(pcie->dev, "Error: Mem Acc, %s, %s, @0x%llx, lanes=%s\n",
+ width_str, direction_str, addr, lanes_str);
+ dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccDsble=%d BadAddr=%d\n",
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT),
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ABORT),
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ),
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED),
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR));
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int brcm_pcie_die_notify_cb(struct notifier_block *self,
+ unsigned long v, void *p)
+{
+ struct brcm_pcie *pcie =
+ container_of(self, struct brcm_pcie, die_notifier);
+
+ return brcm_pcie_dump_err(pcie, "Die");
+}
+
+static int brcm_pcie_panic_notify_cb(struct notifier_block *self,
+ unsigned long v, void *p)
+{
+ struct brcm_pcie *pcie =
+ container_of(self, struct brcm_pcie, panic_notifier);
+
+ return brcm_pcie_dump_err(pcie, "Panic");
+}
+
+static void brcm_register_die_notifiers(struct brcm_pcie *pcie)
+{
+ pcie->panic_notifier.notifier_call = brcm_pcie_panic_notify_cb;
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &pcie->panic_notifier);
+
+ pcie->die_notifier.notifier_call = brcm_pcie_die_notify_cb;
+ register_die_notifier(&pcie->die_notifier);
+}
+
+static void brcm_unregister_die_notifiers(struct brcm_pcie *pcie)
+{
+ unregister_die_notifier(&pcie->die_notifier);
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &pcie->panic_notifier);
+}
+
static void __brcm_pcie_remove(struct brcm_pcie *pcie)
{
brcm_msi_remove(pcie);
@@ -1725,6 +1896,9 @@ static void brcm_pcie_remove(struct platform_device *pdev)
pci_stop_root_bus(bridge->bus);
pci_remove_root_bus(bridge->bus);
+ if (pcie->cfg->has_err_report)
+ brcm_unregister_die_notifiers(pcie);
+
__brcm_pcie_remove(pcie);
}
@@ -1825,6 +1999,7 @@ static const struct pcie_cfg_data bcm7216_cfg = {
.bridge_sw_init_set = brcm_pcie_bridge_sw_init_set_7278,
.has_phy = true,
.num_inbound_wins = 3,
+ .has_err_report = true,
};
static const struct pcie_cfg_data bcm7712_cfg = {
@@ -1921,7 +2096,10 @@ static int brcm_pcie_probe(struct platform_device *pdev)
if (ret)
return dev_err_probe(&pdev->dev, ret, "could not enable clock\n");
- pcie->cfg->bridge_sw_init_set(pcie, 0);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
+ if (ret)
+ return dev_err_probe(&pdev->dev, ret,
+ "could not de-assert bridge reset\n");
if (pcie->swinit_reset) {
ret = reset_control_assert(pcie->swinit_reset);
@@ -1996,6 +2174,11 @@ static int brcm_pcie_probe(struct platform_device *pdev)
return ret;
}
+ if (pcie->cfg->has_err_report) {
+ spin_lock_init(&pcie->bridge_lock);
+ brcm_register_die_notifiers(pcie);
+ }
+
return 0;
fail:
diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c
index 24cc30a2ab6c..4b78b6528f9f 100644
--- a/drivers/pci/controller/pcie-mediatek.c
+++ b/drivers/pci/controller/pcie-mediatek.c
@@ -143,23 +143,33 @@
struct mtk_pcie_port;
/**
+ * enum mtk_pcie_quirks - MTK PCIe quirks
+ * @MTK_PCIE_FIX_CLASS_ID: host's class ID needed to be fixed
+ * @MTK_PCIE_FIX_DEVICE_ID: host's device ID needed to be fixed
+ * @MTK_PCIE_NO_MSI: Bridge has no MSI support, and relies on an external block
+ * @MTK_PCIE_SKIP_RSTB: Skip calling RSTB bits on PCIe probe
+ */
+enum mtk_pcie_quirks {
+ MTK_PCIE_FIX_CLASS_ID = BIT(0),
+ MTK_PCIE_FIX_DEVICE_ID = BIT(1),
+ MTK_PCIE_NO_MSI = BIT(2),
+ MTK_PCIE_SKIP_RSTB = BIT(3),
+};
+
+/**
* struct mtk_pcie_soc - differentiate between host generations
- * @need_fix_class_id: whether this host's class ID needed to be fixed or not
- * @need_fix_device_id: whether this host's device ID needed to be fixed or not
- * @no_msi: Bridge has no MSI support, and relies on an external block
* @device_id: device ID which this host need to be fixed
* @ops: pointer to configuration access functions
* @startup: pointer to controller setting functions
* @setup_irq: pointer to initialize IRQ functions
+ * @quirks: PCIe device quirks
*/
struct mtk_pcie_soc {
- bool need_fix_class_id;
- bool need_fix_device_id;
- bool no_msi;
unsigned int device_id;
struct pci_ops *ops;
int (*startup)(struct mtk_pcie_port *port);
int (*setup_irq)(struct mtk_pcie_port *port, struct device_node *node);
+ enum mtk_pcie_quirks quirks;
};
/**
@@ -679,31 +689,28 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port)
regmap_update_bits(pcie->cfg, PCIE_SYS_CFG_V2, val, val);
}
- /* Assert all reset signals */
- writel(0, port->base + PCIE_RST_CTRL);
+ if (!(soc->quirks & MTK_PCIE_SKIP_RSTB)) {
+ /* Assert all reset signals */
+ writel(0, port->base + PCIE_RST_CTRL);
- /*
- * Enable PCIe link down reset, if link status changed from link up to
- * link down, this will reset MAC control registers and configuration
- * space.
- */
- writel(PCIE_LINKDOWN_RST_EN, port->base + PCIE_RST_CTRL);
+ /*
+ * Enable PCIe link down reset, if link status changed from
+ * link up to link down, this will reset MAC control registers
+ * and configuration space.
+ */
+ writel(PCIE_LINKDOWN_RST_EN, port->base + PCIE_RST_CTRL);
- /*
- * Described in PCIe CEM specification sections 2.2 (PERST# Signal) and
- * 2.2.1 (Initial Power-Up (G3 to S0)). The deassertion of PERST# should
- * be delayed 100ms (TPVPERL) for the power and clock to become stable.
- */
- msleep(100);
+ msleep(PCIE_T_PVPERL_MS);
- /* De-assert PHY, PE, PIPE, MAC and configuration reset */
- val = readl(port->base + PCIE_RST_CTRL);
- val |= PCIE_PHY_RSTB | PCIE_PERSTB | PCIE_PIPE_SRSTB |
- PCIE_MAC_SRSTB | PCIE_CRSTB;
- writel(val, port->base + PCIE_RST_CTRL);
+ /* De-assert PHY, PE, PIPE, MAC and configuration reset */
+ val = readl(port->base + PCIE_RST_CTRL);
+ val |= PCIE_PHY_RSTB | PCIE_PERSTB | PCIE_PIPE_SRSTB |
+ PCIE_MAC_SRSTB | PCIE_CRSTB;
+ writel(val, port->base + PCIE_RST_CTRL);
+ }
/* Set up vendor ID and class code */
- if (soc->need_fix_class_id) {
+ if (soc->quirks & MTK_PCIE_FIX_CLASS_ID) {
val = PCI_VENDOR_ID_MEDIATEK;
writew(val, port->base + PCIE_CONF_VEND_ID);
@@ -711,7 +718,7 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port)
writew(val, port->base + PCIE_CONF_CLASS_ID);
}
- if (soc->need_fix_device_id)
+ if (soc->quirks & MTK_PCIE_FIX_DEVICE_ID)
writew(soc->device_id, port->base + PCIE_CONF_DEVICE_ID);
/* 100ms timeout value should be enough for Gen1/2 training */
@@ -821,6 +828,41 @@ static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
return 0;
}
+static int mtk_pcie_startup_port_an7583(struct mtk_pcie_port *port)
+{
+ struct mtk_pcie *pcie = port->pcie;
+ struct device *dev = pcie->dev;
+ struct pci_host_bridge *host;
+ struct resource_entry *entry;
+ struct regmap *pbus_regmap;
+ resource_size_t addr;
+ u32 args[2], size;
+
+ /*
+	 * Configure the PBus base address and base address mask to let the
+	 * hardware detect whether a given address is accessible through the
+	 * PCIe controller.
+ */
+ pbus_regmap = syscon_regmap_lookup_by_phandle_args(dev->of_node,
+ "mediatek,pbus-csr",
+ ARRAY_SIZE(args),
+ args);
+ if (IS_ERR(pbus_regmap))
+ return PTR_ERR(pbus_regmap);
+
+ host = pci_host_bridge_from_priv(pcie);
+ entry = resource_list_first_type(&host->windows, IORESOURCE_MEM);
+ if (!entry)
+ return -ENODEV;
+
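+	/*
+	 * Program the window base and an address mask covering the bits at
+	 * and above the window size (assumes a power-of-2 sized window).
+	 */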
+ addr = entry->res->start - entry->offset;
+ regmap_write(pbus_regmap, args[0], lower_32_bits(addr));
+ size = lower_32_bits(resource_size(entry->res));
+ regmap_write(pbus_regmap, args[1], GENMASK(31, __fls(size)));
+
+ return mtk_pcie_startup_port_v2(port);
+}
+
static void mtk_pcie_enable_port(struct mtk_pcie_port *port)
{
struct mtk_pcie *pcie = port->pcie;
@@ -1099,7 +1141,7 @@ static int mtk_pcie_probe(struct platform_device *pdev)
host->ops = pcie->soc->ops;
host->sysdata = pcie;
- host->msi_domain = pcie->soc->no_msi;
+ host->msi_domain = !!(pcie->soc->quirks & MTK_PCIE_NO_MSI);
err = pci_host_probe(host);
if (err)
@@ -1187,9 +1229,9 @@ static const struct dev_pm_ops mtk_pcie_pm_ops = {
};
static const struct mtk_pcie_soc mtk_pcie_soc_v1 = {
- .no_msi = true,
.ops = &mtk_pcie_ops,
.startup = mtk_pcie_startup_port,
+ .quirks = MTK_PCIE_NO_MSI,
};
static const struct mtk_pcie_soc mtk_pcie_soc_mt2712 = {
@@ -1199,22 +1241,29 @@ static const struct mtk_pcie_soc mtk_pcie_soc_mt2712 = {
};
static const struct mtk_pcie_soc mtk_pcie_soc_mt7622 = {
- .need_fix_class_id = true,
.ops = &mtk_pcie_ops_v2,
.startup = mtk_pcie_startup_port_v2,
.setup_irq = mtk_pcie_setup_irq,
+ .quirks = MTK_PCIE_FIX_CLASS_ID,
+};
+
+static const struct mtk_pcie_soc mtk_pcie_soc_an7583 = {
+ .ops = &mtk_pcie_ops_v2,
+ .startup = mtk_pcie_startup_port_an7583,
+ .setup_irq = mtk_pcie_setup_irq,
+ .quirks = MTK_PCIE_FIX_CLASS_ID | MTK_PCIE_SKIP_RSTB,
};
static const struct mtk_pcie_soc mtk_pcie_soc_mt7629 = {
- .need_fix_class_id = true,
- .need_fix_device_id = true,
.device_id = PCI_DEVICE_ID_MEDIATEK_7629,
.ops = &mtk_pcie_ops_v2,
.startup = mtk_pcie_startup_port_v2,
.setup_irq = mtk_pcie_setup_irq,
+ .quirks = MTK_PCIE_FIX_CLASS_ID | MTK_PCIE_FIX_DEVICE_ID,
};
static const struct of_device_id mtk_pcie_ids[] = {
+ { .compatible = "airoha,an7583-pcie", .data = &mtk_pcie_soc_an7583 },
{ .compatible = "mediatek,mt2701-pcie", .data = &mtk_pcie_soc_v1 },
{ .compatible = "mediatek,mt7623-pcie", .data = &mtk_pcie_soc_v1 },
{ .compatible = "mediatek,mt2712-pcie", .data = &mtk_pcie_soc_mt2712 },
diff --git a/drivers/pci/controller/pcie-rzg3s-host.c b/drivers/pci/controller/pcie-rzg3s-host.c
new file mode 100644
index 000000000000..667e6d629474
--- /dev/null
+++ b/drivers/pci/controller/pcie-rzg3s-host.c
@@ -0,0 +1,1761 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCIe driver for Renesas RZ/G3S SoCs
+ *
+ * Copyright (C) 2025 Renesas Electronics Corp.
+ *
+ * Based on:
+ * drivers/pci/controller/pcie-rcar-host.c
+ * Copyright (C) 2009 - 2011 Paul Mundt
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/cleanup.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/iopoll.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mutex.h>
+#include <linux/msi.h>
+#include <linux/of_irq.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/regmap.h>
+#include <linux/reset.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/units.h>
+
+#include "../pci.h"
+
+/* AXI registers */
+#define RZG3S_PCI_REQDATA(id) (0x80 + (id) * 0x4)
+#define RZG3S_PCI_REQRCVDAT 0x8c
+
+#define RZG3S_PCI_REQADR1 0x90
+#define RZG3S_PCI_REQADR1_BUS GENMASK(31, 24)
+#define RZG3S_PCI_REQADR1_DEV GENMASK(23, 19)
+#define RZG3S_PCI_REQADR1_FUNC GENMASK(18, 16)
+#define RZG3S_PCI_REQADR1_REG GENMASK(11, 0)
+
+#define RZG3S_PCI_REQBE 0x98
+#define RZG3S_PCI_REQBE_BYTE_EN GENMASK(3, 0)
+
+#define RZG3S_PCI_REQISS 0x9c
+#define RZG3S_PCI_REQISS_MOR_STATUS GENMASK(18, 16)
+#define RZG3S_PCI_REQISS_TR_TYPE GENMASK(11, 8)
+#define RZG3S_PCI_REQISS_TR_TP0_RD FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x4)
+#define RZG3S_PCI_REQISS_TR_TP0_WR FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x5)
+#define RZG3S_PCI_REQISS_TR_TP1_RD FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x6)
+#define RZG3S_PCI_REQISS_TR_TP1_WR FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x7)
+#define RZG3S_PCI_REQISS_REQ_ISSUE BIT(0)
+
+#define RZG3S_PCI_MSIRCVWADRL 0x100
+#define RZG3S_PCI_MSIRCVWADRL_MASK GENMASK(31, 3)
+#define RZG3S_PCI_MSIRCVWADRL_MSG_DATA_ENA BIT(1)
+#define RZG3S_PCI_MSIRCVWADRL_ENA BIT(0)
+
+#define RZG3S_PCI_MSIRCVWADRU 0x104
+
+#define RZG3S_PCI_MSIRCVWMSKL 0x108
+#define RZG3S_PCI_MSIRCVWMSKL_MASK GENMASK(31, 2)
+
+#define RZG3S_PCI_PINTRCVIE 0x110
+#define RZG3S_PCI_PINTRCVIE_INTX(i) BIT(i)
+#define RZG3S_PCI_PINTRCVIE_MSI BIT(4)
+
+#define RZG3S_PCI_PINTRCVIS 0x114
+#define RZG3S_PCI_PINTRCVIS_INTX(i) BIT(i)
+#define RZG3S_PCI_PINTRCVIS_MSI BIT(4)
+
+#define RZG3S_PCI_MSGRCVIE 0x120
+#define RZG3S_PCI_MSGRCVIE_MSG_RCV BIT(24)
+
+#define RZG3S_PCI_MSGRCVIS 0x124
+#define RZG3S_PCI_MSGRCVIS_MRI BIT(24)
+
+#define RZG3S_PCI_PEIE0 0x200
+
+#define RZG3S_PCI_PEIS0 0x204
+#define RZG3S_PCI_PEIS0_RX_DLLP_PM_ENTER BIT(12)
+#define RZG3S_PCI_PEIS0_DL_UPDOWN BIT(9)
+
+#define RZG3S_PCI_PEIE1 0x208
+#define RZG3S_PCI_PEIS1 0x20c
+#define RZG3S_PCI_AMEIS 0x214
+#define RZG3S_PCI_ASEIS1 0x224
+
+#define RZG3S_PCI_PCSTAT1 0x408
+#define RZG3S_PCI_PCSTAT1_LTSSM_STATE GENMASK(14, 10)
+#define RZG3S_PCI_PCSTAT1_DL_DOWN_STS BIT(0)
+
+#define RZG3S_PCI_PCCTRL2 0x410
+#define RZG3S_PCI_PCCTRL2_LS_CHG GENMASK(9, 8)
+#define RZG3S_PCI_PCCTRL2_LS_CHG_REQ BIT(0)
+
+#define RZG3S_PCI_PCSTAT2 0x414
+#define RZG3S_PCI_PCSTAT2_LS_CHG_DONE BIT(28)
+#define RZG3S_PCI_PCSTAT2_SDRIRE GENMASK(7, 1)
+
+#define RZG3S_PCI_PERM 0x300
+#define RZG3S_PCI_PERM_CFG_HWINIT_EN BIT(2)
+#define RZG3S_PCI_PERM_PIPE_PHY_REG_EN BIT(1)
+
+#define RZG3S_PCI_MSIRE(id) (0x600 + (id) * 0x10)
+#define RZG3S_PCI_MSIRE_ENA BIT(0)
+
+#define RZG3S_PCI_MSIRM(id) (0x608 + (id) * 0x10)
+#define RZG3S_PCI_MSIRS(id) (0x60c + (id) * 0x10)
+
+#define RZG3S_PCI_AWBASEL(id) (0x1000 + (id) * 0x20)
+#define RZG3S_PCI_AWBASEL_WIN_ENA BIT(0)
+
+#define RZG3S_PCI_AWBASEU(id) (0x1004 + (id) * 0x20)
+#define RZG3S_PCI_AWMASKL(id) (0x1008 + (id) * 0x20)
+#define RZG3S_PCI_AWMASKU(id) (0x100c + (id) * 0x20)
+#define RZG3S_PCI_ADESTL(id) (0x1010 + (id) * 0x20)
+#define RZG3S_PCI_ADESTU(id) (0x1014 + (id) * 0x20)
+
+#define RZG3S_PCI_PWBASEL(id) (0x1100 + (id) * 0x20)
+#define RZG3S_PCI_PWBASEL_ENA BIT(0)
+
+#define RZG3S_PCI_PWBASEU(id) (0x1104 + (id) * 0x20)
+#define RZG3S_PCI_PDESTL(id) (0x1110 + (id) * 0x20)
+#define RZG3S_PCI_PDESTU(id) (0x1114 + (id) * 0x20)
+#define RZG3S_PCI_PWMASKL(id) (0x1108 + (id) * 0x20)
+#define RZG3S_PCI_PWMASKU(id) (0x110c + (id) * 0x20)
+
+/* PHY control registers */
+#define RZG3S_PCI_PHY_XCFGD(id) (0x2000 + (id) * 0x10)
+#define RZG3S_PCI_PHY_XCFGD_NUM 39
+
+#define RZG3S_PCI_PHY_XCFGA_CMN(id) (0x2400 + (id) * 0x10)
+#define RZG3S_PCI_PHY_XCFGA_CMN_NUM 16
+
+#define RZG3S_PCI_PHY_XCFGA_RX(id) (0x2500 + (id) * 0x10)
+#define RZG3S_PCI_PHY_XCFGA_RX_NUM 13
+
+#define RZG3S_PCI_PHY_XCFGA_TX 0x25d0
+
+#define RZG3S_PCI_PHY_XCFG_CTRL 0x2a20
+#define RZG3S_PCI_PHY_XCFG_CTRL_PHYREG_SEL BIT(0)
+
+/* PCIe registers */
+#define RZG3S_PCI_CFG_BASE 0x6000
+#define RZG3S_PCI_CFG_BARMSK00L 0xa0
+#define RZG3S_PCI_CFG_BARMSK00U 0xa4
+
+#define RZG3S_PCI_CFG_PCIEC 0x60
+
+/* System controller registers */
+#define RZG3S_SYS_PCIE_RST_RSM_B 0xd74
+#define RZG3S_SYS_PCIE_RST_RSM_B_MASK BIT(0)
+
+/* Maximum number of windows */
+#define RZG3S_MAX_WINDOWS 8
+
+/* Number of MSI interrupts per register */
+#define RZG3S_PCI_MSI_INT_PER_REG 32
+/* The number of MSI interrupts */
+#define RZG3S_PCI_MSI_INT_NR RZG3S_PCI_MSI_INT_PER_REG
+
+/* Timeouts experimentally determined */
+#define RZG3S_REQ_ISSUE_TIMEOUT_US 2500
+
+/**
+ * struct rzg3s_pcie_msi - RZ/G3S PCIe MSI data structure
+ * @domain: IRQ domain
+ * @map: bitmap with the allocated MSIs
+ * @dma_addr: address of the allocated MSI window
+ * @window_base: base address of the MSI window
+ * @pages: allocated pages for MSI window mapping
+ * @map_lock: lock for bitmap with the allocated MSIs
+ * @irq: MSI interrupt
+ */
+struct rzg3s_pcie_msi {
+ struct irq_domain *domain;
+ DECLARE_BITMAP(map, RZG3S_PCI_MSI_INT_NR);
+ dma_addr_t dma_addr;
+ dma_addr_t window_base;
+ unsigned long pages;
+ struct mutex map_lock;
+ int irq;
+};
+
+struct rzg3s_pcie_host;
+
+/**
+ * struct rzg3s_pcie_soc_data - SoC specific data
+ * @init_phy: PHY initialization function
+ * @power_resets: array with the resets that need to be de-asserted after
+ * power-on
+ * @cfg_resets: array with the resets that need to be de-asserted after
+ * configuration
+ * @num_power_resets: number of power resets
+ * @num_cfg_resets: number of configuration resets
+ */
+struct rzg3s_pcie_soc_data {
+ int (*init_phy)(struct rzg3s_pcie_host *host);
+ const char * const *power_resets;
+ const char * const *cfg_resets;
+ u8 num_power_resets;
+ u8 num_cfg_resets;
+};
+
+/**
+ * struct rzg3s_pcie_port - RZ/G3S PCIe Root Port data structure
+ * @refclk: PCIe reference clock
+ * @vendor_id: Vendor ID
+ * @device_id: Device ID
+ */
+struct rzg3s_pcie_port {
+ struct clk *refclk;
+ u32 vendor_id;
+ u32 device_id;
+};
+
+/**
+ * struct rzg3s_pcie_host - RZ/G3S PCIe data structure
+ * @axi: base address for AXI registers
+ * @pcie: base address for PCIe registers
+ * @dev: struct device
+ * @power_resets: reset control signals that should be set after power up
+ * @cfg_resets: reset control signals that should be set after configuration
+ * @sysc: SYSC regmap
+ * @intx_domain: INTx IRQ domain
+ * @data: SoC specific data
+ * @msi: MSI data structure
+ * @port: PCIe Root Port
+ * @hw_lock: lock for access to the HW resources
+ * @intx_irqs: INTx interrupts
+ * @max_link_speed: maximum supported link speed
+ */
+struct rzg3s_pcie_host {
+ void __iomem *axi;
+ void __iomem *pcie;
+ struct device *dev;
+ struct reset_control_bulk_data *power_resets;
+ struct reset_control_bulk_data *cfg_resets;
+ struct regmap *sysc;
+ struct irq_domain *intx_domain;
+ const struct rzg3s_pcie_soc_data *data;
+ struct rzg3s_pcie_msi msi;
+ struct rzg3s_pcie_port port;
+ raw_spinlock_t hw_lock;
+ int intx_irqs[PCI_NUM_INTX];
+ int max_link_speed;
+};
+
+#define rzg3s_msi_to_host(_msi) container_of(_msi, struct rzg3s_pcie_host, msi)
+
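+/* 32-bit read-modify-write helper: update only the bits selected by @mask */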
+static void rzg3s_pcie_update_bits(void __iomem *base, u32 offset, u32 mask,
+ u32 val)
+{
+ u32 tmp;
+
+ tmp = readl_relaxed(base + offset);
+ tmp &= ~mask;
+ tmp |= val & mask;
+ writel_relaxed(tmp, base + offset);
+}
+
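+/*
+ * Trigger the prepared config request and poll until the controller clears
+ * the issue bit; a non-zero MOR_STATUS reports a completion error.
+ */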
+static int rzg3s_pcie_child_issue_request(struct rzg3s_pcie_host *host)
+{
+ u32 val;
+ int ret;
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_REQISS,
+ RZG3S_PCI_REQISS_REQ_ISSUE,
+ RZG3S_PCI_REQISS_REQ_ISSUE);
+ ret = readl_poll_timeout_atomic(host->axi + RZG3S_PCI_REQISS, val,
+ !(val & RZG3S_PCI_REQISS_REQ_ISSUE),
+ 5, RZG3S_REQ_ISSUE_TIMEOUT_US);
+
+ if (val & RZG3S_PCI_REQISS_MOR_STATUS)
+ return -EIO;
+
+ return ret;
+}
+
+static void rzg3s_pcie_child_prepare_bus(struct pci_bus *bus,
+ unsigned int devfn, int where)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+ unsigned int dev, func, reg;
+
+ dev = PCI_SLOT(devfn);
+ func = PCI_FUNC(devfn);
+ reg = where & ~0x3;
+
+ /* Set the destination */
+ writel_relaxed(FIELD_PREP(RZG3S_PCI_REQADR1_BUS, bus->number) |
+ FIELD_PREP(RZG3S_PCI_REQADR1_DEV, dev) |
+ FIELD_PREP(RZG3S_PCI_REQADR1_FUNC, func) |
+ FIELD_PREP(RZG3S_PCI_REQADR1_REG, reg),
+ host->axi + RZG3S_PCI_REQADR1);
+
+ /* Set byte enable */
+ writel_relaxed(RZG3S_PCI_REQBE_BYTE_EN, host->axi + RZG3S_PCI_REQBE);
+}
+
+static int rzg3s_pcie_child_read_conf(struct rzg3s_pcie_host *host,
+ struct pci_bus *bus, unsigned int devfn,
+ int where, u32 *data)
+{
+	bool type0 = pci_is_root_bus(bus->parent);
+ int ret;
+
+ rzg3s_pcie_child_prepare_bus(bus, devfn, where);
+
+ /* Set the type of request */
+ writel_relaxed(type0 ? RZG3S_PCI_REQISS_TR_TP0_RD :
+ RZG3S_PCI_REQISS_TR_TP1_RD,
+ host->axi + RZG3S_PCI_REQISS);
+
+ /* Issue the request and wait to finish */
+ ret = rzg3s_pcie_child_issue_request(host);
+ if (ret)
+ return PCIBIOS_SET_FAILED;
+
+ /* Read the data */
+ *data = readl_relaxed(host->axi + RZG3S_PCI_REQRCVDAT);
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+/* Serialization is provided by 'pci_lock' in drivers/pci/access.c */
+static int rzg3s_pcie_child_read(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 *val)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+ int ret;
+
+ ret = rzg3s_pcie_child_read_conf(host, bus, devfn, where, val);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return ret;
+
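+	/* Narrow the 32-bit read to the requested byte or half-word */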
+ if (size <= 2)
+ *val = (*val >> (8 * (where & 3))) & ((1 << (size * 8)) - 1);
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int rzg3s_pcie_child_write_conf(struct rzg3s_pcie_host *host,
+ struct pci_bus *bus, unsigned int devfn,
+ int where, u32 data)
+{
+	bool type0 = pci_is_root_bus(bus->parent);
+ int ret;
+
+ rzg3s_pcie_child_prepare_bus(bus, devfn, where);
+
+ /* Set the write data */
+ writel_relaxed(0, host->axi + RZG3S_PCI_REQDATA(0));
+ writel_relaxed(0, host->axi + RZG3S_PCI_REQDATA(1));
+ writel_relaxed(data, host->axi + RZG3S_PCI_REQDATA(2));
+
+ /* Set the type of request */
+ writel_relaxed(type0 ? RZG3S_PCI_REQISS_TR_TP0_WR :
+ RZG3S_PCI_REQISS_TR_TP1_WR,
+ host->axi + RZG3S_PCI_REQISS);
+
+ /* Issue the request and wait to finish */
+ ret = rzg3s_pcie_child_issue_request(host);
+ if (ret)
+ return PCIBIOS_SET_FAILED;
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+/* Serialization is provided by 'pci_lock' in drivers/pci/access.c */
+static int rzg3s_pcie_child_write(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+ u32 data, shift;
+ int ret;
+
+ if (size == 4)
+ return rzg3s_pcie_child_write_conf(host, bus, devfn, where, val);
+
+ /*
+	 * The controller does 32-bit accesses. To do byte accesses, software
+	 * needs to do read/modify/write. This may have side effects. For
+ * example, software may perform a 16-bit write. If the hardware only
+ * supports 32-bit accesses, we must do a 32-bit read, merge in the 16
+ * bits we intend to write, followed by a 32-bit write. If the 16 bits
+ * we *don't* intend to write happen to have any RW1C
+ * (write-one-to-clear) bits set, we just inadvertently cleared
+ * something we shouldn't have.
+ */
+ if (!bus->unsafe_warn) {
+ dev_warn(&bus->dev, "%d-byte config write to %04x:%02x:%02x.%d offset %#x may corrupt adjacent RW1C bits\n",
+ size, pci_domain_nr(bus), bus->number,
+ PCI_SLOT(devfn), PCI_FUNC(devfn), where);
+ bus->unsafe_warn = 1;
+ }
+
+ ret = rzg3s_pcie_child_read_conf(host, bus, devfn, where, &data);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return ret;
+
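+	/* Merge the 1- or 2-byte value into the right byte lanes of the word */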
+ if (size == 1) {
+ shift = BITS_PER_BYTE * (where & 3);
+ data &= ~(0xff << shift);
+ data |= ((val & 0xff) << shift);
+ } else if (size == 2) {
+ shift = BITS_PER_BYTE * (where & 2);
+ data &= ~(0xffff << shift);
+ data |= ((val & 0xffff) << shift);
+ } else {
+ data = val;
+ }
+
+ return rzg3s_pcie_child_write_conf(host, bus, devfn, where, data);
+}
+
+static struct pci_ops rzg3s_pcie_child_ops = {
+ .read = rzg3s_pcie_child_read,
+ .write = rzg3s_pcie_child_write,
+};
+
+static void __iomem *rzg3s_pcie_root_map_bus(struct pci_bus *bus,
+ unsigned int devfn, int where)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+
+ if (devfn)
+ return NULL;
+
+ return host->pcie + where;
+}
+
+/* Serialized by 'pci_lock' */
+static int rzg3s_pcie_root_write(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+ int ret;
+
+ /* Enable access control to the CFGU */
+ writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN,
+ host->axi + RZG3S_PCI_PERM);
+
+ ret = pci_generic_config_write(bus, devfn, where, size, val);
+
+ /* Disable access control to the CFGU */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PERM);
+
+ return ret;
+}
+
+static struct pci_ops rzg3s_pcie_root_ops = {
+ .read = pci_generic_config_read,
+ .write = rzg3s_pcie_root_write,
+ .map_bus = rzg3s_pcie_root_map_bus,
+};
+
+static void rzg3s_pcie_intx_irq_handler(struct irq_desc *desc)
+{
+ struct rzg3s_pcie_host *host = irq_desc_get_handler_data(desc);
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ unsigned int irq = irq_desc_get_irq(desc);
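+	/* Relies on the inta..intd IRQs having consecutive numbers */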
+ u32 intx = irq - host->intx_irqs[0];
+
+ chained_irq_enter(chip, desc);
+ generic_handle_domain_irq(host->intx_domain, intx);
+ chained_irq_exit(chip, desc);
+}
+
+static irqreturn_t rzg3s_pcie_msi_irq(int irq, void *data)
+{
+ u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG;
+ DECLARE_BITMAP(bitmap, RZG3S_PCI_MSI_INT_NR);
+ struct rzg3s_pcie_host *host = data;
+ struct rzg3s_pcie_msi *msi = &host->msi;
+ unsigned long bit;
+ u32 status;
+
+ status = readl_relaxed(host->axi + RZG3S_PCI_PINTRCVIS);
+ if (!(status & RZG3S_PCI_PINTRCVIS_MSI))
+ return IRQ_NONE;
+
+ /* Clear the MSI */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIS,
+ RZG3S_PCI_PINTRCVIS_MSI,
+ RZG3S_PCI_PINTRCVIS_MSI);
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSGRCVIS,
+ RZG3S_PCI_MSGRCVIS_MRI, RZG3S_PCI_MSGRCVIS_MRI);
+
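+	/* Collect the pending MSI vectors from each 32-bit status register */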
+ for (u8 reg_id = 0; reg_id < regs; reg_id++) {
+ status = readl_relaxed(host->axi + RZG3S_PCI_MSIRS(reg_id));
+ bitmap_write(bitmap, status, reg_id * RZG3S_PCI_MSI_INT_PER_REG,
+ RZG3S_PCI_MSI_INT_PER_REG);
+ }
+
+ for_each_set_bit(bit, bitmap, RZG3S_PCI_MSI_INT_NR) {
+ int ret;
+
+ ret = generic_handle_domain_irq(msi->domain, bit);
+ if (ret) {
+ u8 reg_bit = bit % RZG3S_PCI_MSI_INT_PER_REG;
+ u8 reg_id = bit / RZG3S_PCI_MSI_INT_PER_REG;
+
+ /* Unknown MSI, just clear it */
+ writel_relaxed(BIT(reg_bit),
+ host->axi + RZG3S_PCI_MSIRS(reg_id));
+ }
+ }
+
+ return IRQ_HANDLED;
+}
+
+static void rzg3s_pcie_msi_irq_ack(struct irq_data *d)
+{
+ struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d);
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG;
+ u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG;
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ writel_relaxed(BIT(reg_bit), host->axi + RZG3S_PCI_MSIRS(reg_id));
+}
+
+static void rzg3s_pcie_msi_irq_mask(struct irq_data *d)
+{
+ struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d);
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG;
+ u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG;
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSIRM(reg_id), BIT(reg_bit),
+ BIT(reg_bit));
+}
+
+static void rzg3s_pcie_msi_irq_unmask(struct irq_data *d)
+{
+ struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d);
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG;
+ u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG;
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSIRM(reg_id), BIT(reg_bit),
+ 0);
+}
+
+static void rzg3s_pcie_irq_compose_msi_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(data);
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ u32 lo, hi;
+
+ /*
+	 * The enable and message-data-enable bits are part of the low address
+	 * word. Drop them along with the unused bit.
+ */
+ lo = readl_relaxed(host->axi + RZG3S_PCI_MSIRCVWADRL) &
+ RZG3S_PCI_MSIRCVWADRL_MASK;
+ hi = readl_relaxed(host->axi + RZG3S_PCI_MSIRCVWADRU);
+
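+	/*
+	 * Use the hwirq as the message data; with MSG_DATA_ENA set this
+	 * presumably selects the per-vector status bit the handler reads.
+	 */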
+ msg->address_lo = lo;
+ msg->address_hi = hi;
+ msg->data = data->hwirq;
+}
+
+static struct irq_chip rzg3s_pcie_msi_bottom_chip = {
+ .name = "rzg3s-pcie-msi",
+ .irq_ack = rzg3s_pcie_msi_irq_ack,
+ .irq_mask = rzg3s_pcie_msi_irq_mask,
+ .irq_unmask = rzg3s_pcie_msi_irq_unmask,
+ .irq_compose_msi_msg = rzg3s_pcie_irq_compose_msi_msg,
+};
+
+static int rzg3s_pcie_msi_domain_alloc(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs,
+ void *args)
+{
+ struct rzg3s_pcie_msi *msi = domain->host_data;
+ int hwirq;
+
+ scoped_guard(mutex, &msi->map_lock) {
+ hwirq = bitmap_find_free_region(msi->map, RZG3S_PCI_MSI_INT_NR,
+ order_base_2(nr_irqs));
+ }
+
+ if (hwirq < 0)
+ return -ENOSPC;
+
+ for (unsigned int i = 0; i < nr_irqs; i++) {
+ irq_domain_set_info(domain, virq + i, hwirq + i,
+ &rzg3s_pcie_msi_bottom_chip,
+ domain->host_data, handle_edge_irq, NULL,
+ NULL);
+ }
+
+ return 0;
+}
+
+static void rzg3s_pcie_msi_domain_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+ struct rzg3s_pcie_msi *msi = domain->host_data;
+
+ guard(mutex)(&msi->map_lock);
+
+ bitmap_release_region(msi->map, d->hwirq, order_base_2(nr_irqs));
+}
+
+static const struct irq_domain_ops rzg3s_pcie_msi_domain_ops = {
+ .alloc = rzg3s_pcie_msi_domain_alloc,
+ .free = rzg3s_pcie_msi_domain_free,
+};
+
+#define RZG3S_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+ MSI_FLAG_USE_DEF_CHIP_OPS | \
+ MSI_FLAG_NO_AFFINITY | \
+ MSI_FLAG_PCI_MSI_MASK_PARENT)
+
+#define RZG3S_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI | \
+ MSI_GENERIC_FLAGS_MASK)
+
+static const struct msi_parent_ops rzg3s_pcie_msi_parent_ops = {
+ .required_flags = RZG3S_PCIE_MSI_FLAGS_REQUIRED,
+ .supported_flags = RZG3S_PCIE_MSI_FLAGS_SUPPORTED,
+ .bus_select_token = DOMAIN_BUS_PCI_MSI,
+ .chip_flags = MSI_CHIP_FLAG_SET_ACK,
+ .prefix = "RZG3S-",
+ .init_dev_msi_info = msi_lib_init_dev_msi_info,
+};
+
+static int rzg3s_pcie_msi_allocate_domains(struct rzg3s_pcie_msi *msi)
+{
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ struct device *dev = host->dev;
+ struct irq_domain_info info = {
+ .fwnode = dev_fwnode(dev),
+ .ops = &rzg3s_pcie_msi_domain_ops,
+ .size = RZG3S_PCI_MSI_INT_NR,
+ .host_data = msi,
+ };
+
+ msi->domain = msi_create_parent_irq_domain(&info,
+ &rzg3s_pcie_msi_parent_ops);
+ if (!msi->domain)
+ return dev_err_probe(dev, -ENOMEM,
+ "failed to create IRQ domain\n");
+
+ return 0;
+}
+
+static int rzg3s_pcie_msi_hw_setup(struct rzg3s_pcie_host *host)
+{
+ u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG;
+ struct rzg3s_pcie_msi *msi = &host->msi;
+
+ /*
+ * Set MSI window size. HW will set the window to
+ * RZG3S_PCI_MSI_INT_NR * 4 bytes.
+ */
+ writel_relaxed(FIELD_PREP(RZG3S_PCI_MSIRCVWMSKL_MASK,
+ RZG3S_PCI_MSI_INT_NR - 1),
+ host->axi + RZG3S_PCI_MSIRCVWMSKL);
+
+ /* Set MSI window address and enable MSI window */
+ writel_relaxed(upper_32_bits(msi->window_base),
+ host->axi + RZG3S_PCI_MSIRCVWADRU);
+ writel_relaxed(lower_32_bits(msi->window_base) |
+ RZG3S_PCI_MSIRCVWADRL_ENA |
+ RZG3S_PCI_MSIRCVWADRL_MSG_DATA_ENA,
+ host->axi + RZG3S_PCI_MSIRCVWADRL);
+
+ /* Set MSI receive enable */
+ for (u8 reg_id = 0; reg_id < regs; reg_id++) {
+ writel_relaxed(RZG3S_PCI_MSIRE_ENA,
+ host->axi + RZG3S_PCI_MSIRE(reg_id));
+ }
+
+ /* Enable message receive interrupts */
+ writel_relaxed(RZG3S_PCI_MSGRCVIE_MSG_RCV,
+ host->axi + RZG3S_PCI_MSGRCVIE);
+
+ /* Enable MSI */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE,
+ RZG3S_PCI_PINTRCVIE_MSI,
+ RZG3S_PCI_PINTRCVIE_MSI);
+
+ return 0;
+}
+
+static int rzg3s_pcie_msi_setup(struct rzg3s_pcie_host *host)
+{
+ size_t size = RZG3S_PCI_MSI_INT_NR * sizeof(u32);
+ struct rzg3s_pcie_msi *msi = &host->msi;
+ struct device *dev = host->dev;
+ int id, ret;
+
+ msi->pages = __get_free_pages(GFP_KERNEL | GFP_DMA, 0);
+ if (!msi->pages)
+ return -ENOMEM;
+
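+	/*
+	 * Map twice the window size so that a size-aligned window always
+	 * fits inside the mapped buffer.
+	 */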
+ msi->dma_addr = dma_map_single(dev, (void *)msi->pages, size * 2,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, msi->dma_addr)) {
+ ret = -ENOMEM;
+ goto free_pages;
+ }
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section 34.4.5.2 Setting
+ * the MSI Window) the MSI window needs to fall within one of the
+	 * enabled AXI windows. Find an enabled AXI window in which to set up
+	 * the MSI window.
+ */
+ for (id = 0; id < RZG3S_MAX_WINDOWS; id++) {
+ u64 base, basel, baseu;
+ u64 mask, maskl, masku;
+
+ basel = readl_relaxed(host->axi + RZG3S_PCI_AWBASEL(id));
+ /* Skip checking this AXI window if it's not enabled */
+ if (!(basel & RZG3S_PCI_AWBASEL_WIN_ENA))
+ continue;
+
+ baseu = readl_relaxed(host->axi + RZG3S_PCI_AWBASEU(id));
+ base = baseu << 32 | basel;
+
+ maskl = readl_relaxed(host->axi + RZG3S_PCI_AWMASKL(id));
+ masku = readl_relaxed(host->axi + RZG3S_PCI_AWMASKU(id));
+ mask = masku << 32 | maskl;
+
+ if (msi->dma_addr < base || msi->dma_addr > base + mask)
+ continue;
+
+ break;
+ }
+
+ if (id == RZG3S_MAX_WINDOWS) {
+ ret = -EINVAL;
+ goto dma_unmap;
+ }
+
+ /* The MSI base address must be aligned to the MSI size */
+ msi->window_base = ALIGN(msi->dma_addr, size);
+ if (msi->window_base < msi->dma_addr) {
+ ret = -EINVAL;
+ goto dma_unmap;
+ }
+
+ rzg3s_pcie_msi_hw_setup(host);
+
+ return 0;
+
+dma_unmap:
+ dma_unmap_single(dev, msi->dma_addr, size * 2, DMA_BIDIRECTIONAL);
+free_pages:
+ free_pages(msi->pages, 0);
+ return ret;
+}
+
+static void rzg3s_pcie_msi_hw_teardown(struct rzg3s_pcie_host *host)
+{
+ u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG;
+
+ /* Disable MSI */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE,
+ RZG3S_PCI_PINTRCVIE_MSI, 0);
+
+ /* Disable message receive interrupts */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSGRCVIE,
+ RZG3S_PCI_MSGRCVIE_MSG_RCV, 0);
+
+ /* Disable MSI receive enable */
+ for (u8 reg_id = 0; reg_id < regs; reg_id++)
+ writel_relaxed(0, host->axi + RZG3S_PCI_MSIRE(reg_id));
+
+ /* Disable MSI window */
+ writel_relaxed(0, host->axi + RZG3S_PCI_MSIRCVWADRL);
+}
+
+static void rzg3s_pcie_teardown_msi(struct rzg3s_pcie_host *host)
+{
+ size_t size = RZG3S_PCI_MSI_INT_NR * sizeof(u32);
+ struct rzg3s_pcie_msi *msi = &host->msi;
+
+ rzg3s_pcie_msi_hw_teardown(host);
+
+ free_irq(msi->irq, host);
+ irq_domain_remove(msi->domain);
+
+ /* Free unused memory */
+ dma_unmap_single(host->dev, msi->dma_addr, size * 2, DMA_BIDIRECTIONAL);
+ free_pages(msi->pages, 0);
+}
+
+static int rzg3s_pcie_init_msi(struct rzg3s_pcie_host *host)
+{
+ struct platform_device *pdev = to_platform_device(host->dev);
+ struct rzg3s_pcie_msi *msi = &host->msi;
+ struct device *dev = host->dev;
+ const char *devname;
+ int ret;
+
+ ret = devm_mutex_init(dev, &msi->map_lock);
+ if (ret)
+ return ret;
+
+ msi->irq = platform_get_irq_byname(pdev, "msi");
+ if (msi->irq < 0)
+ return dev_err_probe(dev, msi->irq, "Failed to get MSI IRQ!\n");
+
+ devname = devm_kasprintf(dev, GFP_KERNEL, "%s-msi", dev_name(dev));
+ if (!devname)
+ return -ENOMEM;
+
+ ret = rzg3s_pcie_msi_allocate_domains(msi);
+ if (ret)
+ return ret;
+
+ /*
+ * Don't use devm_request_irq() as the driver uses non-devm helpers
+ * to control clocks. Mixing them may lead to subtle bugs.
+ */
+ ret = request_irq(msi->irq, rzg3s_pcie_msi_irq, 0, devname, host);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to request IRQ: %d\n", ret);
+ goto free_domains;
+ }
+
+ ret = rzg3s_pcie_msi_setup(host);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to setup MSI!\n");
+ goto free_irq;
+ }
+
+ return 0;
+
+free_irq:
+ free_irq(msi->irq, host);
+free_domains:
+ irq_domain_remove(msi->domain);
+ return ret;
+}
+
+static void rzg3s_pcie_intx_irq_ack(struct irq_data *d)
+{
+ struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d);
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIS,
+ RZG3S_PCI_PINTRCVIS_INTX(d->hwirq),
+ RZG3S_PCI_PINTRCVIS_INTX(d->hwirq));
+}
+
+static void rzg3s_pcie_intx_irq_mask(struct irq_data *d)
+{
+ struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d);
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE,
+ RZG3S_PCI_PINTRCVIE_INTX(d->hwirq), 0);
+}
+
+static void rzg3s_pcie_intx_irq_unmask(struct irq_data *d)
+{
+ struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d);
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE,
+ RZG3S_PCI_PINTRCVIE_INTX(d->hwirq),
+ RZG3S_PCI_PINTRCVIE_INTX(d->hwirq));
+}
+
+static struct irq_chip rzg3s_pcie_intx_irq_chip = {
+ .name = "PCIe INTx",
+ .irq_ack = rzg3s_pcie_intx_irq_ack,
+ .irq_mask = rzg3s_pcie_intx_irq_mask,
+ .irq_unmask = rzg3s_pcie_intx_irq_unmask,
+};
+
+static int rzg3s_pcie_intx_map(struct irq_domain *domain, unsigned int irq,
+ irq_hw_number_t hwirq)
+{
+ irq_set_chip_and_handler(irq, &rzg3s_pcie_intx_irq_chip,
+ handle_level_irq);
+ irq_set_chip_data(irq, domain->host_data);
+
+ return 0;
+}
+
+static const struct irq_domain_ops rzg3s_pcie_intx_domain_ops = {
+ .map = rzg3s_pcie_intx_map,
+ .xlate = irq_domain_xlate_onetwocell,
+};
+
+static int rzg3s_pcie_init_irqdomain(struct rzg3s_pcie_host *host)
+{
+ struct device *dev = host->dev;
+ struct platform_device *pdev = to_platform_device(dev);
+
+ for (int i = 0; i < PCI_NUM_INTX; i++) {
+ char irq_name[5] = {0};
+ int irq;
+
+ scnprintf(irq_name, ARRAY_SIZE(irq_name), "int%c", 'a' + i);
+
+ irq = platform_get_irq_byname(pdev, irq_name);
+ if (irq < 0)
+ return dev_err_probe(dev, -EINVAL,
+ "Failed to parse and map INT%c IRQ\n",
+ 'A' + i);
+
+ host->intx_irqs[i] = irq;
+ irq_set_chained_handler_and_data(irq,
+ rzg3s_pcie_intx_irq_handler,
+ host);
+ }
+
+ host->intx_domain = irq_domain_create_linear(dev_fwnode(dev),
+ PCI_NUM_INTX,
+ &rzg3s_pcie_intx_domain_ops,
+ host);
+ if (!host->intx_domain)
+ return dev_err_probe(dev, -EINVAL,
+ "Failed to add irq domain for INTx IRQs\n");
+ irq_domain_update_bus_token(host->intx_domain, DOMAIN_BUS_WIRED);
+
+ if (IS_ENABLED(CONFIG_PCI_MSI)) {
+ int ret = rzg3s_pcie_init_msi(host);
+
+ if (ret) {
+ irq_domain_remove(host->intx_domain);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void rzg3s_pcie_teardown_irqdomain(struct rzg3s_pcie_host *host)
+{
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ rzg3s_pcie_teardown_msi(host);
+
+ irq_domain_remove(host->intx_domain);
+}
+
+static int rzg3s_pcie_set_max_link_speed(struct rzg3s_pcie_host *host)
+{
+ u32 remote_supported_link_speeds, max_supported_link_speeds;
+ u32 cs2, tmp, pcie_cap = RZG3S_PCI_CFG_PCIEC;
+ u32 cur_link_speed, link_speed;
+ u8 ltssm_state_l0 = 0xc;
+ int ret;
+ u16 ls;
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section 34.6.3 Caution
+ * when Changing the Speed Spontaneously) link speed change can be done
+ * only when the LTSSM is in L0.
+ */
+ ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT1, tmp,
+ FIELD_GET(RZG3S_PCI_PCSTAT1_LTSSM_STATE, tmp) == ltssm_state_l0,
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI,
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI *
+ PCIE_LINK_WAIT_MAX_RETRIES);
+ if (ret)
+ return ret;
+
+ ls = readw_relaxed(host->pcie + pcie_cap + PCI_EXP_LNKSTA);
+ cs2 = readl_relaxed(host->axi + RZG3S_PCI_PCSTAT2);
+
+ switch (pcie_link_speed[host->max_link_speed]) {
+ case PCIE_SPEED_5_0GT:
+ max_supported_link_speeds = GENMASK(PCI_EXP_LNKSTA_CLS_5_0GB - 1, 0);
+ link_speed = PCI_EXP_LNKCTL2_TLS_5_0GT;
+ break;
+ default:
+ /* Should not happen */
+ return -EINVAL;
+ }
+
+ cur_link_speed = FIELD_GET(PCI_EXP_LNKSTA_CLS, ls);
+ remote_supported_link_speeds = FIELD_GET(RZG3S_PCI_PCSTAT2_SDRIRE, cs2);
+ /* Drop reserved bits */
+ remote_supported_link_speeds &= max_supported_link_speeds;
+
+ /*
+ * Return if max link speed is already set or the connected device
+ * doesn't support it.
+ */
+ if (cur_link_speed == host->max_link_speed ||
+ remote_supported_link_speeds != max_supported_link_speeds)
+ return 0;
+
+ /* Set target Link speed */
+ rzg3s_pcie_update_bits(host->pcie, pcie_cap + PCI_EXP_LNKCTL2,
+ PCI_EXP_LNKCTL2_TLS,
+ FIELD_PREP(PCI_EXP_LNKCTL2_TLS, link_speed));
+
+ /* Request link speed change */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PCCTRL2,
+ RZG3S_PCI_PCCTRL2_LS_CHG_REQ |
+ RZG3S_PCI_PCCTRL2_LS_CHG,
+ RZG3S_PCI_PCCTRL2_LS_CHG_REQ |
+ FIELD_PREP(RZG3S_PCI_PCCTRL2_LS_CHG,
+ link_speed - 1));
+
+ ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT2, cs2,
+ (cs2 & RZG3S_PCI_PCSTAT2_LS_CHG_DONE),
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI,
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI *
+ PCIE_LINK_WAIT_MAX_RETRIES);
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section 34.6.3 Caution
+ * when Changing the Speed Spontaneously) the PCI_PCCTRL2_LS_CHG_REQ
+ * should be de-asserted after checking for PCI_PCSTAT2_LS_CHG_DONE.
+ */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PCCTRL2,
+ RZG3S_PCI_PCCTRL2_LS_CHG_REQ, 0);
+
+ return ret;
+}
+
+static int rzg3s_pcie_config_init(struct rzg3s_pcie_host *host)
+{
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host);
+ struct resource_entry *ft;
+ struct resource *bus;
+ u8 subordinate_bus;
+ u8 secondary_bus;
+ u8 primary_bus;
+
+ ft = resource_list_first_type(&bridge->windows, IORESOURCE_BUS);
+ if (!ft)
+ return -ENODEV;
+
+ bus = ft->res;
+ primary_bus = bus->start;
+ secondary_bus = bus->start + 1;
+ subordinate_bus = bus->end;
+
+ /* Enable access control to the CFGU */
+ writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN,
+ host->axi + RZG3S_PCI_PERM);
+
+	/* The HW manual recommends writing 0xffffffff on initialization */
+ writel_relaxed(0xffffffff, host->pcie + RZG3S_PCI_CFG_BARMSK00L);
+ writel_relaxed(0xffffffff, host->pcie + RZG3S_PCI_CFG_BARMSK00U);
+
+ /* Update bus info */
+ writeb_relaxed(primary_bus, host->pcie + PCI_PRIMARY_BUS);
+ writeb_relaxed(secondary_bus, host->pcie + PCI_SECONDARY_BUS);
+ writeb_relaxed(subordinate_bus, host->pcie + PCI_SUBORDINATE_BUS);
+
+ /* Disable access control to the CFGU */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PERM);
+
+ return 0;
+}
+
+static void rzg3s_pcie_irq_init(struct rzg3s_pcie_host *host)
+{
+ /*
+ * According to the HW manual of the RZ/G3S (Rev.1.10, sections
+ * corresponding to all registers written with ~0U), the hardware
+	 * ignores values written to unused bits. Writing ~0U to these registers
+ * should be safe.
+ */
+
+ /* Clear the link state and PM transitions */
+ writel_relaxed(RZG3S_PCI_PEIS0_DL_UPDOWN |
+ RZG3S_PCI_PEIS0_RX_DLLP_PM_ENTER,
+ host->axi + RZG3S_PCI_PEIS0);
+
+ /* Disable all interrupts */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PEIE0);
+
+	/* Clear all parity and ECC error interrupts */
+ writel_relaxed(~0U, host->axi + RZG3S_PCI_PEIS1);
+
+	/* Disable all parity and ECC error interrupts */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PEIE1);
+
+ /* Clear all AXI master error interrupts */
+ writel_relaxed(~0U, host->axi + RZG3S_PCI_AMEIS);
+
+ /* Clear all AXI slave error interrupts */
+ writel_relaxed(~0U, host->axi + RZG3S_PCI_ASEIS1);
+
+ /* Clear all message receive interrupts */
+ writel_relaxed(~0U, host->axi + RZG3S_PCI_MSGRCVIS);
+}
+
+static int rzg3s_pcie_power_resets_deassert(struct rzg3s_pcie_host *host)
+{
+ const struct rzg3s_pcie_soc_data *data = host->data;
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section
+ * 34.5.1.2 De-asserting the Reset) the PCIe IP needs to wait 5ms from
+ * power on to the de-assertion of reset.
+ */
+ fsleep(5000);
+ return reset_control_bulk_deassert(data->num_power_resets,
+ host->power_resets);
+}
+
+static int rzg3s_pcie_resets_prepare_and_get(struct rzg3s_pcie_host *host)
+{
+ const struct rzg3s_pcie_soc_data *data = host->data;
+ unsigned int i;
+ int ret;
+
+ host->power_resets = devm_kmalloc_array(host->dev,
+ data->num_power_resets,
+ sizeof(*host->power_resets),
+ GFP_KERNEL);
+ if (!host->power_resets)
+ return -ENOMEM;
+
+ for (i = 0; i < data->num_power_resets; i++)
+ host->power_resets[i].id = data->power_resets[i];
+
+ host->cfg_resets = devm_kmalloc_array(host->dev,
+ data->num_cfg_resets,
+ sizeof(*host->cfg_resets),
+ GFP_KERNEL);
+ if (!host->cfg_resets)
+ return -ENOMEM;
+
+ for (i = 0; i < data->num_cfg_resets; i++)
+ host->cfg_resets[i].id = data->cfg_resets[i];
+
+ ret = devm_reset_control_bulk_get_exclusive(host->dev,
+ data->num_power_resets,
+ host->power_resets);
+ if (ret)
+ return ret;
+
+ return devm_reset_control_bulk_get_exclusive(host->dev,
+ data->num_cfg_resets,
+ host->cfg_resets);
+}
+
+static int rzg3s_pcie_host_parse_port(struct rzg3s_pcie_host *host)
+{
+ struct device_node *of_port = of_get_next_child(host->dev->of_node, NULL);
+ struct rzg3s_pcie_port *port = &host->port;
+ int ret;
+
+ ret = of_property_read_u32(of_port, "vendor-id", &port->vendor_id);
+ if (ret)
+ return ret;
+
+ ret = of_property_read_u32(of_port, "device-id", &port->device_id);
+ if (ret)
+ return ret;
+
+ port->refclk = of_clk_get_by_name(of_port, "ref");
+ if (IS_ERR(port->refclk))
+ return PTR_ERR(port->refclk);
+
+ return 0;
+}
+
+static int rzg3s_pcie_host_init_port(struct rzg3s_pcie_host *host)
+{
+ struct rzg3s_pcie_port *port = &host->port;
+ struct device *dev = host->dev;
+ int ret;
+
+ /* Enable access control to the CFGU */
+ writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN,
+ host->axi + RZG3S_PCI_PERM);
+
+ /* Update vendor ID and device ID */
+ writew_relaxed(port->vendor_id, host->pcie + PCI_VENDOR_ID);
+ writew_relaxed(port->device_id, host->pcie + PCI_DEVICE_ID);
+
+ /* Disable access control to the CFGU */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PERM);
+
+ ret = clk_prepare_enable(port->refclk);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to enable refclk!\n");
+
+ /* Configure the PHY, if any */
+ if (host->data->init_phy) {
+ ret = host->data->init_phy(host);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to set the PHY!\n");
+ goto refclk_disable;
+ }
+ }
+
+ return 0;
+
+refclk_disable:
+ clk_disable_unprepare(port->refclk);
+ return ret;
+}
+
+static int rzg3s_pcie_host_init(struct rzg3s_pcie_host *host)
+{
+ u32 val;
+ int ret;
+
+ /* Initialize the PCIe related registers */
+ ret = rzg3s_pcie_config_init(host);
+ if (ret)
+ return ret;
+
+ ret = rzg3s_pcie_host_init_port(host);
+ if (ret)
+ return ret;
+
+ /* Initialize the interrupts */
+ rzg3s_pcie_irq_init(host);
+
+ ret = reset_control_bulk_deassert(host->data->num_cfg_resets,
+ host->cfg_resets);
+ if (ret)
+ goto disable_port_refclk;
+
+ /* Wait for link up */
+ ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT1, val,
+ !(val & RZG3S_PCI_PCSTAT1_DL_DOWN_STS),
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI,
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI *
+ PCIE_LINK_WAIT_MAX_RETRIES);
+ if (ret)
+ goto cfg_resets_deassert;
+
+ val = readl_relaxed(host->axi + RZG3S_PCI_PCSTAT2);
+ dev_info(host->dev, "PCIe link status [0x%x]\n", val);
+
+ return 0;
+
+cfg_resets_deassert:
+ reset_control_bulk_assert(host->data->num_cfg_resets,
+ host->cfg_resets);
+disable_port_refclk:
+ clk_disable_unprepare(host->port.refclk);
+ return ret;
+}
+
+static void rzg3s_pcie_set_inbound_window(struct rzg3s_pcie_host *host,
+ u64 cpu_addr, u64 pci_addr, u64 size,
+ int id)
+{
+ /* Set CPU window base address */
+ writel_relaxed(upper_32_bits(cpu_addr),
+ host->axi + RZG3S_PCI_ADESTU(id));
+ writel_relaxed(lower_32_bits(cpu_addr),
+ host->axi + RZG3S_PCI_ADESTL(id));
+
+ /* Set window size */
+ writel_relaxed(upper_32_bits(size), host->axi + RZG3S_PCI_AWMASKU(id));
+ writel_relaxed(lower_32_bits(size), host->axi + RZG3S_PCI_AWMASKL(id));
+
+ /* Set PCIe window base address and enable the window */
+ writel_relaxed(upper_32_bits(pci_addr),
+ host->axi + RZG3S_PCI_AWBASEU(id));
+ writel_relaxed(lower_32_bits(pci_addr) | RZG3S_PCI_AWBASEL_WIN_ENA,
+ host->axi + RZG3S_PCI_AWBASEL(id));
+}
+
+static int rzg3s_pcie_set_inbound_windows(struct rzg3s_pcie_host *host,
+ struct resource_entry *entry,
+ int *index)
+{
+ u64 pci_addr = entry->res->start - entry->offset;
+ u64 cpu_addr = entry->res->start;
+ u64 cpu_end = entry->res->end;
+ u64 size_id = 0;
+ int id = *index;
+ u64 size;
+
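+ /*
+ * Cover the resource with power-of-two windows, bailing out once all
+ * hardware windows are in use.
+ */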
+ while (cpu_addr < cpu_end) {
+ if (id >= RZG3S_MAX_WINDOWS)
+ return dev_err_probe(host->dev, -ENOSPC,
+ "Failed to map inbound window for resource (%s)\n",
+ entry->res->name);
+
+ size = resource_size(entry->res) - size_id;
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10,
+ * section 34.3.1.71 AXI Window Mask (Lower) Registers) the
+ * minimum size is 4K.
+ */
+ size = max(size, SZ_4K);
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, sections:
+ * - 34.3.1.69 AXI Window Base (Lower) Registers
+ * - 34.3.1.71 AXI Window Mask (Lower) Registers
+ * - 34.3.1.73 AXI Destination (Lower) Registers)
+ * the CPU address, PCIe address, and size should be 4K-aligned and
+ * a power of 2.
+ */
+ size = ALIGN(size, SZ_4K);
+ size = roundup_pow_of_two(size);
+
+ cpu_addr = ALIGN(cpu_addr, SZ_4K);
+ pci_addr = ALIGN(pci_addr, SZ_4K);
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section
+ * 34.3.1.71 AXI Window Mask (Lower) Registers) the HW expects the
+ * lowest 12 bits to be 0xfff, so subtract 1 from the size.
+ */
+ rzg3s_pcie_set_inbound_window(host, cpu_addr, pci_addr,
+ size - 1, id);
+
+ pci_addr += size;
+ cpu_addr += size;
+ size_id = size;
+ id++;
+ }
+ *index = id;
+
+ return 0;
+}
+
+static int rzg3s_pcie_parse_map_dma_ranges(struct rzg3s_pcie_host *host)
+{
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host);
+ struct resource_entry *entry;
+ int i = 0, ret;
+
+ resource_list_for_each_entry(entry, &bridge->dma_ranges) {
+ ret = rzg3s_pcie_set_inbound_windows(host, entry, &i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void rzg3s_pcie_set_outbound_window(struct rzg3s_pcie_host *host,
+ struct resource_entry *win, int id)
+{
+ struct resource *res = win->res;
+ resource_size_t size = resource_size(res);
+ resource_size_t res_start;
+
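+ /*
+ * I/O resources hold port numbers; translate them to CPU addresses
+ * before applying the bridge window offset.
+ */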
+ if (res->flags & IORESOURCE_IO)
+ res_start = pci_pio_to_address(res->start) - win->offset;
+ else
+ res_start = res->start - win->offset;
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section 34.3.1.75 PCIe
+ * Window Base (Lower) Registers) the window base address needs to
+ * be 4K-aligned.
+ */
+ res_start = ALIGN(res_start, SZ_4K);
+
+ size = ALIGN(size, SZ_4K);
+ size = roundup_pow_of_two(size) - 1;
+
+ /* Set PCIe destination */
+ writel_relaxed(upper_32_bits(res_start),
+ host->axi + RZG3S_PCI_PDESTU(id));
+ writel_relaxed(lower_32_bits(res_start),
+ host->axi + RZG3S_PCI_PDESTL(id));
+
+ /* Set PCIe window mask */
+ writel_relaxed(upper_32_bits(size), host->axi + RZG3S_PCI_PWMASKU(id));
+ writel_relaxed(lower_32_bits(size), host->axi + RZG3S_PCI_PWMASKL(id));
+
+ /* Set PCIe window base and enable the window */
+ writel_relaxed(upper_32_bits(res_start),
+ host->axi + RZG3S_PCI_PWBASEU(id));
+ writel_relaxed(lower_32_bits(res_start) | RZG3S_PCI_PWBASEL_ENA,
+ host->axi + RZG3S_PCI_PWBASEL(id));
+}
+
+static int rzg3s_pcie_parse_map_ranges(struct rzg3s_pcie_host *host)
+{
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host);
+ struct resource_entry *win;
+ int i = 0;
+
+ resource_list_for_each_entry(win, &bridge->windows) {
+ struct resource *res = win->res;
+
+ if (i >= RZG3S_MAX_WINDOWS)
+ return dev_err_probe(host->dev, -ENOSPC,
+ "Failed to map outbound window for resource (%s)\n",
+ res->name);
+
+ if (!res->flags)
+ continue;
+
+ switch (resource_type(res)) {
+ case IORESOURCE_IO:
+ case IORESOURCE_MEM:
+ rzg3s_pcie_set_outbound_window(host, win, i);
+ i++;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int rzg3s_soc_pcie_init_phy(struct rzg3s_pcie_host *host)
+{
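+ /*
+ * Opaque PHY tuning values; entries not listed in the designated
+ * initializers default to 0.
+ */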
+ static const u32 xcfgd_settings[RZG3S_PCI_PHY_XCFGD_NUM] = {
+ [8] = 0xe0006801, 0x007f7e30, 0x183e0000, 0x978ff500,
+ 0xec000000, 0x009f1400, 0x0000d009,
+ [17] = 0x78000000,
+ [19] = 0x00880000, 0x000005c0, 0x07000000, 0x00780920,
+ 0xc9400ce2, 0x90000c0c, 0x000c1414, 0x00005034,
+ 0x00006000, 0x00000001,
+ };
+ static const u32 xcfga_cmn_settings[RZG3S_PCI_PHY_XCFGA_CMN_NUM] = {
+ 0x00000d10, 0x08310100, 0x00c21404, 0x013c0010, 0x01874440,
+ 0x1a216082, 0x00103440, 0x00000080, 0x00000010, 0x0c1000c1,
+ 0x1000c100, 0x0222000c, 0x00640019, 0x00a00028, 0x01d11228,
+ 0x0201001d,
+ };
+ static const u32 xcfga_rx_settings[RZG3S_PCI_PHY_XCFGA_RX_NUM] = {
+ 0x07d55000, 0x030e3f00, 0x00000288, 0x102c5880, 0x0000000b,
+ 0x04141441, 0x00641641, 0x00d63d63, 0x00641641, 0x01970377,
+ 0x00190287, 0x00190028, 0x00000028,
+ };
+ unsigned int i;
+
+ /*
+ * Enable access permission for physical layer control and status
+ * registers.
+ */
+ writel_relaxed(RZG3S_PCI_PERM_PIPE_PHY_REG_EN,
+ host->axi + RZG3S_PCI_PERM);
+
+ for (i = 0; i < RZG3S_PCI_PHY_XCFGD_NUM; i++) {
+ writel_relaxed(xcfgd_settings[i],
+ host->axi + RZG3S_PCI_PHY_XCFGD(i));
+ }
+
+ for (i = 0; i < RZG3S_PCI_PHY_XCFGA_CMN_NUM; i++) {
+ writel_relaxed(xcfga_cmn_settings[i],
+ host->axi + RZG3S_PCI_PHY_XCFGA_CMN(i));
+ }
+
+ for (i = 0; i < RZG3S_PCI_PHY_XCFGA_RX_NUM; i++) {
+ writel_relaxed(xcfga_rx_settings[i],
+ host->axi + RZG3S_PCI_PHY_XCFGA_RX(i));
+ }
+
+ writel_relaxed(0x107, host->axi + RZG3S_PCI_PHY_XCFGA_TX);
+
+ /* Select PHY settings values */
+ writel_relaxed(RZG3S_PCI_PHY_XCFG_CTRL_PHYREG_SEL,
+ host->axi + RZG3S_PCI_PHY_XCFG_CTRL);
+
+ /*
+ * Disable access permission for physical layer control and status
+ * registers.
+ */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PERM);
+
+ return 0;
+}
+
+static int
+rzg3s_pcie_host_setup(struct rzg3s_pcie_host *host,
+ int (*init_irqdomain)(struct rzg3s_pcie_host *host),
+ void (*teardown_irqdomain)(struct rzg3s_pcie_host *host))
+{
+ struct device *dev = host->dev;
+ int ret;
+
+ /* Set inbound windows */
+ ret = rzg3s_pcie_parse_map_dma_ranges(host);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to set inbound windows!\n");
+
+ /* Set outbound windows */
+ ret = rzg3s_pcie_parse_map_ranges(host);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to set outbound windows!\n");
+
+ ret = init_irqdomain(host);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to init IRQ domain\n");
+
+ ret = rzg3s_pcie_host_init(host);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to initialize the HW!\n");
+ goto teardown_irqdomain;
+ }
+
+ ret = rzg3s_pcie_set_max_link_speed(host);
+ if (ret)
+ dev_info(dev, "Failed to set max link speed\n");
+
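+ /* Wait for downstream devices to become config-ready after reset. */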
+ msleep(PCIE_RESET_CONFIG_WAIT_MS);
+
+ return 0;
+
+teardown_irqdomain:
+ teardown_irqdomain(host);
+
+ return ret;
+}
+
+static int rzg3s_pcie_probe(struct platform_device *pdev)
+{
+ struct pci_host_bridge *bridge;
+ struct device *dev = &pdev->dev;
+ struct device_node *np = dev->of_node;
+ struct device_node *sysc_np __free(device_node) =
+ of_parse_phandle(np, "renesas,sysc", 0);
+ struct rzg3s_pcie_host *host;
+ int ret;
+
+ bridge = devm_pci_alloc_host_bridge(dev, sizeof(*host));
+ if (!bridge)
+ return -ENOMEM;
+
+ host = pci_host_bridge_priv(bridge);
+ host->dev = dev;
+ host->data = device_get_match_data(dev);
+ platform_set_drvdata(pdev, host);
+
+ host->axi = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(host->axi))
+ return PTR_ERR(host->axi);
+ host->pcie = host->axi + RZG3S_PCI_CFG_BASE;
+
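+ /* Default to Gen2 if the DT does not limit the link speed. */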
+ host->max_link_speed = of_pci_get_max_link_speed(np);
+ if (host->max_link_speed < 0)
+ host->max_link_speed = 2;
+
+ ret = rzg3s_pcie_host_parse_port(host);
+ if (ret)
+ return ret;
+
+ host->sysc = syscon_node_to_regmap(sysc_np);
+ if (IS_ERR(host->sysc)) {
+ ret = PTR_ERR(host->sysc);
+ goto port_refclk_put;
+ }
+
+ ret = regmap_update_bits(host->sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 1));
+ if (ret)
+ goto port_refclk_put;
+
+ ret = rzg3s_pcie_resets_prepare_and_get(host);
+ if (ret)
+ goto sysc_signal_restore;
+
+ ret = rzg3s_pcie_power_resets_deassert(host);
+ if (ret)
+ goto sysc_signal_restore;
+
+ pm_runtime_enable(dev);
+
+ /*
+ * Controller clocks are part of a clock power domain. Enable them
+ * through runtime PM.
+ */
+ ret = pm_runtime_resume_and_get(dev);
+ if (ret)
+ goto rpm_disable;
+
+ raw_spin_lock_init(&host->hw_lock);
+
+ ret = rzg3s_pcie_host_setup(host, rzg3s_pcie_init_irqdomain,
+ rzg3s_pcie_teardown_irqdomain);
+ if (ret)
+ goto rpm_put;
+
+ bridge->sysdata = host;
+ bridge->ops = &rzg3s_pcie_root_ops;
+ bridge->child_ops = &rzg3s_pcie_child_ops;
+ ret = pci_host_probe(bridge);
+ if (ret)
+ goto host_probe_teardown;
+
+ return 0;
+
+host_probe_teardown:
+ rzg3s_pcie_teardown_irqdomain(host);
+ reset_control_bulk_deassert(host->data->num_cfg_resets,
+ host->cfg_resets);
+rpm_put:
+ pm_runtime_put_sync(dev);
+rpm_disable:
+ pm_runtime_disable(dev);
+ reset_control_bulk_assert(host->data->num_power_resets,
+ host->power_resets);
+sysc_signal_restore:
+ /*
+ * The SYSC RST_RSM_B signal needs to be asserted before turning off
+ * the power to the PHY.
+ */
+ regmap_update_bits(host->sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0));
+port_refclk_put:
+ clk_put(host->port.refclk);
+
+ return ret;
+}
+
+static int rzg3s_pcie_suspend_noirq(struct device *dev)
+{
+ struct rzg3s_pcie_host *host = dev_get_drvdata(dev);
+ const struct rzg3s_pcie_soc_data *data = host->data;
+ struct rzg3s_pcie_port *port = &host->port;
+ struct regmap *sysc = host->sysc;
+ int ret;
+
+ ret = pm_runtime_put_sync(dev);
+ if (ret)
+ return ret;
+
+ clk_disable_unprepare(port->refclk);
+
+ ret = reset_control_bulk_assert(data->num_power_resets,
+ host->power_resets);
+ if (ret)
+ goto refclk_restore;
+
+ ret = reset_control_bulk_assert(data->num_cfg_resets,
+ host->cfg_resets);
+ if (ret)
+ goto power_resets_restore;
+
+ ret = regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0));
+ if (ret)
+ goto cfg_resets_restore;
+
+ return 0;
+
+ /* Restore the previous state if any error happens */
+cfg_resets_restore:
+ reset_control_bulk_deassert(data->num_cfg_resets,
+ host->cfg_resets);
+power_resets_restore:
+ reset_control_bulk_deassert(data->num_power_resets,
+ host->power_resets);
+refclk_restore:
+ clk_prepare_enable(port->refclk);
+ pm_runtime_resume_and_get(dev);
+ return ret;
+}
+
+static int rzg3s_pcie_resume_noirq(struct device *dev)
+{
+ struct rzg3s_pcie_host *host = dev_get_drvdata(dev);
+ const struct rzg3s_pcie_soc_data *data = host->data;
+ struct regmap *sysc = host->sysc;
+ int ret;
+
+ ret = regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 1));
+ if (ret)
+ return ret;
+
+ ret = rzg3s_pcie_power_resets_deassert(host);
+ if (ret)
+ goto assert_rst_rsm_b;
+
+ ret = pm_runtime_resume_and_get(dev);
+ if (ret)
+ goto assert_power_resets;
+
+ ret = rzg3s_pcie_host_setup(host, rzg3s_pcie_msi_hw_setup,
+ rzg3s_pcie_msi_hw_teardown);
+ if (ret)
+ goto rpm_put;
+
+ return 0;
+
+ /*
+ * If any error happens there is no way to recover the IP. Put it in the
+ * lowest possible power state.
+ */
+rpm_put:
+ pm_runtime_put_sync(dev);
+assert_power_resets:
+ reset_control_bulk_assert(data->num_power_resets,
+ host->power_resets);
+assert_rst_rsm_b:
+ regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0));
+ return ret;
+}
+
+static const struct dev_pm_ops rzg3s_pcie_pm_ops = {
+ NOIRQ_SYSTEM_SLEEP_PM_OPS(rzg3s_pcie_suspend_noirq,
+ rzg3s_pcie_resume_noirq)
+};
+
+static const char * const rzg3s_soc_power_resets[] = {
+ "aresetn", "rst_cfg_b", "rst_load_b",
+};
+
+static const char * const rzg3s_soc_cfg_resets[] = {
+ "rst_b", "rst_ps_b", "rst_gp_b", "rst_rsm_b",
+};
+
+static const struct rzg3s_pcie_soc_data rzg3s_soc_data = {
+ .power_resets = rzg3s_soc_power_resets,
+ .num_power_resets = ARRAY_SIZE(rzg3s_soc_power_resets),
+ .cfg_resets = rzg3s_soc_cfg_resets,
+ .num_cfg_resets = ARRAY_SIZE(rzg3s_soc_cfg_resets),
+ .init_phy = rzg3s_soc_pcie_init_phy,
+};
+
+static const struct of_device_id rzg3s_pcie_of_match[] = {
+ {
+ .compatible = "renesas,r9a08g045-pcie",
+ .data = &rzg3s_soc_data,
+ },
+ {}
+};
+
+static struct platform_driver rzg3s_pcie_driver = {
+ .driver = {
+ .name = "rzg3s-pcie-host",
+ .of_match_table = rzg3s_pcie_of_match,
+ .pm = pm_ptr(&rzg3s_pcie_pm_ops),
+ .suppress_bind_attrs = true,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+ .probe = rzg3s_pcie_probe,
+};
+builtin_platform_driver(rzg3s_pcie_driver);
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index b4b62b9ccc45..ec6afc38e898 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -578,22 +578,6 @@ static void vmd_detach_resources(struct vmd_dev *vmd)
vmd->dev->resource[VMD_MEMBAR2].child = NULL;
}
-/*
- * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
- * Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of which the lower
- * 16 bits are the PCI Segment Group (domain) number. Other bits are
- * currently reserved.
- */
-static int vmd_find_free_domain(void)
-{
- int domain = 0xffff;
- struct pci_bus *bus = NULL;
-
- while ((bus = pci_find_next_bus(bus)) != NULL)
- domain = max_t(int, domain, pci_domain_nr(bus));
- return domain + 1;
-}
-
static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint,
resource_size_t *offset1,
resource_size_t *offset2)
@@ -878,13 +862,6 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
.parent = res,
};
- sd->vmd_dev = vmd->dev;
- sd->domain = vmd_find_free_domain();
- if (sd->domain < 0)
- return sd->domain;
-
- sd->node = pcibus_to_node(vmd->dev->bus);
-
/*
* Currently MSI remapping must be enabled in guest passthrough mode
* due to some missing interrupt remapping plumbing. This is probably
@@ -910,9 +887,24 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]);
pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]);
+ sd->vmd_dev = vmd->dev;
+
+ /*
+ * Emulated domains start at 0x10000 to not clash with ACPI _SEG
+ * domains. Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of
+ * which the lower 16 bits are the PCI Segment Group (domain) number.
+ * Other bits are currently reserved.
+ */
+ sd->domain = pci_bus_find_emul_domain_nr(0, 0x10000, INT_MAX);
+ if (sd->domain < 0)
+ return sd->domain;
+
+ sd->node = pcibus_to_node(vmd->dev->bus);
+
vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start,
&vmd_ops, sd, &resources);
if (!vmd->bus) {
+ pci_bus_release_emul_domain_nr(sd->domain);
pci_free_resource_list(&resources);
vmd_remove_irq_domain(vmd);
return -ENODEV;
@@ -1005,6 +997,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
return -ENOMEM;
vmd->dev = dev;
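+ /* No emulated domain has been assigned yet. */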
+ vmd->sysdata.domain = PCI_DOMAIN_NR_NOT_SET;
vmd->instance = ida_alloc(&vmd_instance_ida, GFP_KERNEL);
if (vmd->instance < 0)
return vmd->instance;
@@ -1070,6 +1063,7 @@ static void vmd_remove(struct pci_dev *dev)
vmd_detach_resources(vmd);
vmd_remove_irq_domain(vmd);
ida_free(&vmd_instance_ida, vmd->instance);
+ pci_bus_release_emul_domain_nr(vmd->sysdata.domain);
}
static void vmd_shutdown(struct pci_dev *dev)
diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index 0686f7ecbe91..debd235253c5 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -729,8 +729,9 @@ static void pci_epf_test_enable_doorbell(struct pci_epf_test *epf_test,
if (bar < BAR_0)
goto err_doorbell_cleanup;
- ret = request_irq(epf->db_msg[0].virq, pci_epf_test_doorbell_handler, 0,
- "pci-ep-test-doorbell", epf_test);
+ ret = request_threaded_irq(epf->db_msg[0].virq, NULL,
+ pci_epf_test_doorbell_handler, IRQF_ONESHOT,
+ "pci-ep-test-doorbell", epf_test);
if (ret) {
dev_err(&epf->dev,
"Failed to request doorbell IRQ: %d\n",
diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c
index 83e9ab10f9c4..3ecc5059f92b 100644
--- a/drivers/pci/endpoint/functions/pci-epf-vntb.c
+++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c
@@ -36,11 +36,13 @@
* PCIe Root Port PCI EP
*/
+#include <linux/atomic.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/pci-ep-msi.h>
#include <linux/pci-epc.h>
#include <linux/pci-epf.h>
#include <linux/ntb.h>
@@ -126,12 +128,13 @@ struct epf_ntb {
u32 db_count;
u32 spad_count;
u64 mws_size[MAX_MW];
- u64 db;
+ atomic64_t db;
u32 vbus_number;
u16 vntb_pid;
u16 vntb_vid;
bool linkup;
+ bool msi_doorbell;
u32 spad_size;
enum pci_barno epf_ntb_bar[VNTB_BAR_NUM];
@@ -258,9 +261,9 @@ static void epf_ntb_cmd_handler(struct work_struct *work)
ntb = container_of(work, struct epf_ntb, cmd_handler.work);
- for (i = 1; i < ntb->db_count; i++) {
+ for (i = 1; i < ntb->db_count && !ntb->msi_doorbell; i++) {
if (ntb->epf_db[i]) {
- ntb->db |= 1 << (i - 1);
+ atomic64_or(1 << (i - 1), &ntb->db);
ntb_db_event(&ntb->ntb, i);
ntb->epf_db[i] = 0;
}
@@ -319,7 +322,21 @@ static void epf_ntb_cmd_handler(struct work_struct *work)
reset_handler:
queue_delayed_work(kpcintb_workqueue, &ntb->cmd_handler,
- msecs_to_jiffies(5));
+ ntb->msi_doorbell ? msecs_to_jiffies(500) : msecs_to_jiffies(5));
+}
+
+static irqreturn_t epf_ntb_doorbell_handler(int irq, void *data)
+{
+ struct epf_ntb *ntb = data;
+ int i;
+
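+ /* Map the triggering MSI virq back to its doorbell bit. */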
+ for (i = 1; i < ntb->db_count; i++)
+ if (irq == ntb->epf->db_msg[i].virq) {
+ atomic64_or(1 << (i - 1), &ntb->db);
+ ntb_db_event(&ntb->ntb, i);
+ }
+
+ return IRQ_HANDLED;
}
/**
@@ -500,6 +517,94 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb)
return 0;
}
+static int epf_ntb_db_bar_init_msi_doorbell(struct epf_ntb *ntb,
+ struct pci_epf_bar *db_bar,
+ const struct pci_epc_features *epc_features,
+ enum pci_barno barno)
+{
+ struct pci_epf *epf = ntb->epf;
+ dma_addr_t low, high;
+ struct msi_msg *msg;
+ size_t sz;
+ int ret;
+ int i;
+
+ ret = pci_epf_alloc_doorbell(epf, ntb->db_count);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < ntb->db_count; i++) {
+ ret = request_irq(epf->db_msg[i].virq, epf_ntb_doorbell_handler,
+ 0, "pci_epf_vntb_db", ntb);
+
+ if (ret) {
+ dev_err(&epf->dev,
+ "Failed to request doorbell IRQ: %d\n",
+ epf->db_msg[i].virq);
+ goto err_free_irq;
+ }
+ }
+
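+ /*
+ * Find the lowest and highest MSI target addresses so the doorbell
+ * BAR can cover the whole range.
+ */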
+ msg = &epf->db_msg[0].msg;
+
+ high = 0;
+ low = (u64)msg->address_hi << 32 | msg->address_lo;
+
+ for (i = 0; i < ntb->db_count; i++) {
+ struct msi_msg *msg = &epf->db_msg[i].msg;
+ dma_addr_t addr = (u64)msg->address_hi << 32 | msg->address_lo;
+
+ low = min(low, addr);
+ high = max(high, addr);
+ }
+
+ sz = high - low + sizeof(u32);
+
+ ret = pci_epf_assign_bar_space(epf, sz, barno, epc_features, 0, low);
+ if (ret) {
+ dev_err(&epf->dev, "Failed to assign Doorbell BAR space\n");
+ goto err_free_irq;
+ }
+
+ ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no,
+ ntb->epf->vfunc_no, db_bar);
+ if (ret) {
+ dev_err(&epf->dev, "Failed to set Doorbell BAR\n");
+ goto err_free_irq;
+ }
+
+ for (i = 0; i < ntb->db_count; i++) {
+ struct msi_msg *msg = &epf->db_msg[i].msg;
+ dma_addr_t addr;
+ size_t offset;
+
+ ret = pci_epf_align_inbound_addr(epf, db_bar->barno,
+ ((u64)msg->address_hi << 32) | msg->address_lo,
+ &addr, &offset);
+
+ if (ret) {
+ ntb->msi_doorbell = false;
+ goto err_free_irq;
+ }
+
+ ntb->reg->db_data[i] = msg->data;
+ ntb->reg->db_offset[i] = offset;
+ }
+
+ ntb->reg->db_entry_size = 0;
+
+ ntb->msi_doorbell = true;
+
+ return 0;
+
+err_free_irq:
+ for (i--; i >= 0; i--)
+ free_irq(epf->db_msg[i].virq, ntb);
+
+ pci_epf_free_doorbell(ntb->epf);
+ return ret;
+}
+
/**
* epf_ntb_db_bar_init() - Configure Doorbell window BARs
* @ntb: NTB device that facilitates communication between HOST and VHOST
@@ -520,21 +625,25 @@ static int epf_ntb_db_bar_init(struct epf_ntb *ntb)
ntb->epf->func_no,
ntb->epf->vfunc_no);
barno = ntb->epf_ntb_bar[BAR_DB];
-
- mw_addr = pci_epf_alloc_space(ntb->epf, size, barno, epc_features, 0);
- if (!mw_addr) {
- dev_err(dev, "Failed to allocate OB address\n");
- return -ENOMEM;
- }
-
- ntb->epf_db = mw_addr;
-
epf_bar = &ntb->epf->bar[barno];
- ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no, ntb->epf->vfunc_no, epf_bar);
+ ret = epf_ntb_db_bar_init_msi_doorbell(ntb, epf_bar, epc_features, barno);
if (ret) {
- dev_err(dev, "Doorbell BAR set failed\n");
+ /* fall back to polling mode */
+ mw_addr = pci_epf_alloc_space(ntb->epf, size, barno, epc_features, 0);
+ if (!mw_addr) {
+ dev_err(dev, "Failed to allocate OB address\n");
+ return -ENOMEM;
+ }
+
+ ntb->epf_db = mw_addr;
+
+ ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no,
+ ntb->epf->vfunc_no, epf_bar);
+ if (ret) {
+ dev_err(dev, "Doorbell BAR set failed\n");
goto err_alloc_peer_mem;
+ }
}
return ret;
@@ -554,6 +663,16 @@ static void epf_ntb_db_bar_clear(struct epf_ntb *ntb)
{
enum pci_barno barno;
+ if (ntb->msi_doorbell) {
+ int i;
+
+ for (i = 0; i < ntb->db_count; i++)
+ free_irq(ntb->epf->db_msg[i].virq, ntb);
+ }
+
+ if (ntb->epf->db_msg)
+ pci_epf_free_doorbell(ntb->epf);
+
barno = ntb->epf_ntb_bar[BAR_DB];
pci_epf_free_space(ntb->epf, ntb->epf_db, barno, 0);
pci_epc_clear_bar(ntb->epf->epc,
@@ -1268,7 +1387,7 @@ static u64 vntb_epf_db_read(struct ntb_dev *ndev)
{
struct epf_ntb *ntb = ntb_ndev(ndev);
- return ntb->db;
+ return atomic64_read(&ntb->db);
}
static int vntb_epf_mw_get_align(struct ntb_dev *ndev, int pidx, int idx,
@@ -1308,7 +1427,7 @@ static int vntb_epf_db_clear(struct ntb_dev *ndev, u64 db_bits)
{
struct epf_ntb *ntb = ntb_ndev(ndev);
- ntb->db &= ~db_bits;
+ atomic64_and(~db_bits, &ntb->db);
return 0;
}
diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index d54e18872aef..9a505c796370 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -208,6 +208,48 @@ void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf)
}
EXPORT_SYMBOL_GPL(pci_epf_remove_vepf);
+static int pci_epf_get_required_bar_size(struct pci_epf *epf, size_t *bar_size,
+ size_t *aligned_mem_size,
+ enum pci_barno bar,
+ const struct pci_epc_features *epc_features,
+ enum pci_epc_interface_type type)
+{
+ u64 bar_fixed_size = epc_features->bar[bar].fixed_size;
+ size_t align = epc_features->align;
+ size_t size = *bar_size;
+
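+ /* The EPF core never allocates a BAR smaller than 128 bytes. */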
+ if (size < 128)
+ size = 128;
+
+ /* According to PCIe base spec, min size for a resizable BAR is 1 MB. */
+ if (epc_features->bar[bar].type == BAR_RESIZABLE && size < SZ_1M)
+ size = SZ_1M;
+
+ if (epc_features->bar[bar].type == BAR_FIXED && bar_fixed_size) {
+ if (size > bar_fixed_size) {
+ dev_err(&epf->dev,
+ "requested BAR size is larger than fixed size\n");
+ return -ENOMEM;
+ }
+ size = bar_fixed_size;
+ } else {
+ /* BAR size must be power of two */
+ size = roundup_pow_of_two(size);
+ }
+
+ *bar_size = size;
+
+ /*
+ * The EPC's BAR start address must meet alignment requirements. In most
+ * cases, the alignment will match the BAR size. However, differences
+ * can occur; for example, when the fixed BAR size (e.g., 128 bytes) is
+ * smaller than the required alignment (e.g., 4 KB).
+ */
+ *aligned_mem_size = align ? ALIGN(size, align) : size;
+
+ return 0;
+}
+
/**
* pci_epf_free_space() - free the allocated PCI EPF register space
* @epf: the EPF device from whom to free the memory
@@ -236,13 +278,13 @@ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
}
dev = epc->dev.parent;
- dma_free_coherent(dev, epf_bar[bar].aligned_size, addr,
+ dma_free_coherent(dev, epf_bar[bar].mem_size, addr,
epf_bar[bar].phys_addr);
epf_bar[bar].phys_addr = 0;
epf_bar[bar].addr = NULL;
epf_bar[bar].size = 0;
- epf_bar[bar].aligned_size = 0;
+ epf_bar[bar].mem_size = 0;
epf_bar[bar].barno = 0;
epf_bar[bar].flags = 0;
}
@@ -264,40 +306,16 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
const struct pci_epc_features *epc_features,
enum pci_epc_interface_type type)
{
- u64 bar_fixed_size = epc_features->bar[bar].fixed_size;
- size_t aligned_size, align = epc_features->align;
struct pci_epf_bar *epf_bar;
dma_addr_t phys_addr;
struct pci_epc *epc;
struct device *dev;
+ size_t mem_size;
void *space;
- if (size < 128)
- size = 128;
-
- /* According to PCIe base spec, min size for a resizable BAR is 1 MB. */
- if (epc_features->bar[bar].type == BAR_RESIZABLE && size < SZ_1M)
- size = SZ_1M;
-
- if (epc_features->bar[bar].type == BAR_FIXED && bar_fixed_size) {
- if (size > bar_fixed_size) {
- dev_err(&epf->dev,
- "requested BAR size is larger than fixed size\n");
- return NULL;
- }
- size = bar_fixed_size;
- } else {
- /* BAR size must be power of two */
- size = roundup_pow_of_two(size);
- }
-
- /*
- * Allocate enough memory to accommodate the iATU alignment
- * requirement. In most cases, this will be the same as .size but
- * it might be different if, for example, the fixed size of a BAR
- * is smaller than align.
- */
- aligned_size = align ? ALIGN(size, align) : size;
+ if (pci_epf_get_required_bar_size(epf, &size, &mem_size, bar,
+ epc_features, type))
+ return NULL;
if (type == PRIMARY_INTERFACE) {
epc = epf->epc;
@@ -308,7 +326,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
}
dev = epc->dev.parent;
- space = dma_alloc_coherent(dev, aligned_size, &phys_addr, GFP_KERNEL);
+ space = dma_alloc_coherent(dev, mem_size, &phys_addr, GFP_KERNEL);
if (!space) {
dev_err(dev, "failed to allocate mem space\n");
return NULL;
@@ -317,7 +335,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
epf_bar[bar].phys_addr = phys_addr;
epf_bar[bar].addr = space;
epf_bar[bar].size = size;
- epf_bar[bar].aligned_size = aligned_size;
+ epf_bar[bar].mem_size = mem_size;
epf_bar[bar].barno = bar;
if (upper_32_bits(size) || epc_features->bar[bar].only_64bit)
epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
@@ -328,6 +346,83 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
}
EXPORT_SYMBOL_GPL(pci_epf_alloc_space);
+/**
+ * pci_epf_assign_bar_space() - Assign PCI EPF BAR space
+ * @epf: EPF device to assign the BAR memory
+ * @size: Size of the memory that has to be assigned
+ * @bar: BAR number for which the memory is assigned
+ * @epc_features: Features provided by the EPC specific to this EPF
+ * @type: Identifies if the assignment is for primary EPC or secondary EPC
+ * @bar_addr: Address to be assigned for the @bar
+ *
+ * Invoke to assign memory for the PCI EPF BAR.
+ * Flag PCI_BASE_ADDRESS_MEM_TYPE_64 will automatically get set if the BAR
+ * can only be a 64-bit BAR, or if the requested size is larger than 2 GB.
+ */
+int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size,
+ enum pci_barno bar,
+ const struct pci_epc_features *epc_features,
+ enum pci_epc_interface_type type,
+ dma_addr_t bar_addr)
+{
+ size_t bar_size, aligned_mem_size;
+ struct pci_epf_bar *epf_bar;
+ dma_addr_t limit;
+ int pos;
+
+ if (!size)
+ return -EINVAL;
+
+ limit = bar_addr + size - 1;
+
+ /*
+ * Bits: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ * bar_addr: U U U U U U 0 X X X X X X X X X
+ * limit: U U U U U U 1 X X X X X X X X X
+ *
+ * bar_addr^limit 0 0 0 0 0 0 1 X X X X X X X X X
+ *
+ * U: unchanged address bits in range [bar_addr, limit]
+ * X: bit 0 or 1
+ *
+ * (bar_addr^limit) & BIT_ULL(pos) will find the first set bit from MSB
+ * (pos). And value of (2 ^ pos) should be able to cover the BAR range.
+ */
+ for (pos = 8 * sizeof(dma_addr_t) - 1; pos > 0; pos--)
+ if ((limit ^ bar_addr) & BIT_ULL(pos))
+ break;
+
+ if (pos == 8 * sizeof(dma_addr_t) - 1)
+ return -EINVAL;
+
+ bar_size = BIT_ULL(pos + 1);
+ if (pci_epf_get_required_bar_size(epf, &bar_size, &aligned_mem_size,
+ bar, epc_features, type))
+ return -ENOMEM;
+
+ if (type == PRIMARY_INTERFACE)
+ epf_bar = epf->bar;
+ else
+ epf_bar = epf->sec_epc_bar;
+
+ epf_bar[bar].phys_addr = ALIGN_DOWN(bar_addr, aligned_mem_size);
+
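+ /* The aligned-down base must still cover the range up to limit. */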
+ if (epf_bar[bar].phys_addr + bar_size < limit)
+ return -ENOMEM;
+
+ epf_bar[bar].addr = NULL;
+ epf_bar[bar].size = bar_size;
+ epf_bar[bar].mem_size = aligned_mem_size;
+ epf_bar[bar].barno = bar;
+ if (upper_32_bits(size) || epc_features->bar[bar].only_64bit)
+ epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+ else
+ epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_32;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_epf_assign_bar_space);
+
static void pci_epf_remove_cfs(struct pci_epf_driver *driver)
{
struct config_group *group, *tmp;
diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c
index afa50b446567..be5ef6516cff 100644
--- a/drivers/pci/host-bridge.c
+++ b/drivers/pci/host-bridge.c
@@ -33,6 +33,7 @@ struct device *pci_get_host_bridge_device(struct pci_dev *dev)
kobject_get(&bridge->kobj);
return bridge;
}
+EXPORT_SYMBOL_GPL(pci_get_host_bridge_device);
void pci_put_host_bridge_device(struct device *dev)
{
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 77dee43b7858..00784a60ba80 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -158,8 +158,7 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
return dev->sriov->barsz[pci_resource_num_to_vf_bar(resno)];
}
-void pci_iov_resource_set_size(struct pci_dev *dev, int resno,
- resource_size_t size)
+void pci_iov_resource_set_size(struct pci_dev *dev, int resno, int size)
{
if (!pci_resource_is_iov(resno)) {
pci_warn(dev, "%s is not an IOV resource\n",
@@ -167,7 +166,8 @@ void pci_iov_resource_set_size(struct pci_dev *dev, int resno,
return;
}
- dev->sriov->barsz[pci_resource_num_to_vf_bar(resno)] = size;
+ resno = pci_resource_num_to_vf_bar(resno);
+ dev->sriov->barsz[resno] = pci_rebar_size_to_bytes(size);
}
bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev)
@@ -1339,29 +1339,16 @@ EXPORT_SYMBOL_GPL(pci_sriov_configure_simple);
*/
int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size)
{
- u32 sizes;
- int ret;
-
if (!pci_resource_is_iov(resno))
return -EINVAL;
if (pci_iov_is_memory_decoding_enabled(dev))
return -EBUSY;
- sizes = pci_rebar_get_possible_sizes(dev, resno);
- if (!sizes)
- return -ENOTSUPP;
-
- if (!(sizes & BIT(size)))
+ if (!pci_rebar_size_supported(dev, resno, size))
return -EINVAL;
- ret = pci_rebar_set_size(dev, resno, size);
- if (ret)
- return ret;
-
- pci_iov_resource_set_size(dev, resno, pci_rebar_size_to_bytes(size));
-
- return 0;
+ return pci_rebar_set_size(dev, resno, size);
}
EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size);
@@ -1380,7 +1367,7 @@ EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size);
u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs)
{
u64 vf_len = pci_resource_len(dev, resno);
- u32 sizes;
+ u64 sizes;
if (!num_vfs)
return 0;
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 78e108e47254..981a76b6b7c0 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -25,12 +25,12 @@ struct pci_p2pdma {
struct gen_pool *pool;
bool p2pmem_published;
struct xarray map_types;
+ struct p2pdma_provider mem[PCI_STD_NUM_BARS];
};
struct pci_p2pdma_pagemap {
- struct pci_dev *provider;
- u64 bus_offset;
struct dev_pagemap pgmap;
+ struct p2pdma_provider *mem;
};
static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
@@ -204,8 +204,8 @@ static void p2pdma_page_free(struct page *page)
{
struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
/* safe to dereference while a reference is held to the percpu ref */
- struct pci_p2pdma *p2pdma =
- rcu_dereference_protected(pgmap->provider->p2pdma, 1);
+ struct pci_p2pdma *p2pdma = rcu_dereference_protected(
+ to_pci_dev(pgmap->mem->owner)->p2pdma, 1);
struct percpu_ref *ref;
gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page),
@@ -228,56 +228,136 @@ static void pci_p2pdma_release(void *data)
/* Flush and disable pci_alloc_p2p_mem() */
pdev->p2pdma = NULL;
- synchronize_rcu();
+ if (p2pdma->pool)
+ synchronize_rcu();
+ xa_destroy(&p2pdma->map_types);
+
+ if (!p2pdma->pool)
+ return;
gen_pool_destroy(p2pdma->pool);
sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
- xa_destroy(&p2pdma->map_types);
}
-static int pci_p2pdma_setup(struct pci_dev *pdev)
+/**
+ * pcim_p2pdma_init - Initialise peer-to-peer DMA providers
+ * @pdev: The PCI device to enable P2PDMA for
+ *
+ * This function initializes the peer-to-peer DMA infrastructure
+ * for a PCI device. It allocates and sets up the necessary data
+ * structures to support P2PDMA operations, including mapping type
+ * tracking.
+ */
+int pcim_p2pdma_init(struct pci_dev *pdev)
{
- int error = -ENOMEM;
struct pci_p2pdma *p2p;
+ int i, ret;
+
+ p2p = rcu_dereference_protected(pdev->p2pdma, 1);
+ if (p2p)
+ return 0;
p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL);
if (!p2p)
return -ENOMEM;
xa_init(&p2p->map_types);
+ /*
+ * Iterate over all standard PCI BARs and record only those that
+ * correspond to MMIO regions. Skip non-memory resources (e.g. I/O
+ * port BARs) since they cannot be used for peer-to-peer (P2P)
+ * transactions.
+ */
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+ continue;
- p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
- if (!p2p->pool)
- goto out;
+ p2p->mem[i].owner = &pdev->dev;
+ p2p->mem[i].bus_offset =
+ pci_bus_address(pdev, i) - pci_resource_start(pdev, i);
+ }
- error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
- if (error)
- goto out_pool_destroy;
+ ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
+ if (ret)
+ goto out_p2p;
- error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
- if (error)
+ rcu_assign_pointer(pdev->p2pdma, p2p);
+ return 0;
+
+out_p2p:
+ devm_kfree(&pdev->dev, p2p);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pcim_p2pdma_init);
+
+/**
+ * pcim_p2pdma_provider - Get peer-to-peer DMA provider
+ * @pdev: The PCI device to get the P2PDMA provider for
+ * @bar: BAR index of the provider
+ *
+ * This function gets the peer-to-peer DMA provider for a PCI device. The lifetime
+ * of the provider (and of course the MMIO) is bound to the lifetime of the
+ * driver. A driver calling this function must ensure that all references to the
+ * provider, and any DMA mappings created for any MMIO, are all cleaned up
+ * before the driver remove() completes.
+ *
+ * Since P2P memory is almost always shared with a second driver, some
+ * mechanism to notify, invalidate, and revoke the MMIO's DMA mappings must
+ * be in place to use this function. For example, a revoke can be built
+ * using DMABUF.
+ */
+struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar)
+{
+ struct pci_p2pdma *p2p;
+
+ if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
+ return NULL;
+
+ p2p = rcu_dereference_protected(pdev->p2pdma, 1);
+ if (WARN_ON(!p2p))
+ /* Someone forgot to call pcim_p2pdma_init() first */
+ return NULL;
+
+ return &p2p->mem[bar];
+}
+EXPORT_SYMBOL_GPL(pcim_p2pdma_provider);
+
+static int pci_p2pdma_setup_pool(struct pci_dev *pdev)
+{
+ struct pci_p2pdma *p2pdma;
+ int ret;
+
+ p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
+ if (p2pdma->pool)
+ /* The pool is already set up; do nothing. */
+ return 0;
+
+ p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
+ if (!p2pdma->pool)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
+ if (ret)
goto out_pool_destroy;
- rcu_assign_pointer(pdev->p2pdma, p2p);
return 0;
out_pool_destroy:
- gen_pool_destroy(p2p->pool);
-out:
- devm_kfree(&pdev->dev, p2p);
- return error;
+ gen_pool_destroy(p2pdma->pool);
+ p2pdma->pool = NULL;
+ return ret;
}
static void pci_p2pdma_unmap_mappings(void *data)
{
- struct pci_dev *pdev = data;
+ struct pci_p2pdma_pagemap *p2p_pgmap = data;
/*
* Removing the alloc attribute from sysfs will call
* unmap_mapping_range() on the inode, teardown any existing userspace
* mappings and prevent new ones from being created.
*/
- sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
+ sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj,
+ &p2pmem_alloc_attr.attr,
p2pmem_group.name);
}
@@ -295,6 +375,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
u64 offset)
{
struct pci_p2pdma_pagemap *p2p_pgmap;
+ struct p2pdma_provider *mem;
struct dev_pagemap *pgmap;
struct pci_p2pdma *p2pdma;
void *addr;
@@ -312,11 +393,21 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
if (size + offset > pci_resource_len(pdev, bar))
return -EINVAL;
- if (!pdev->p2pdma) {
- error = pci_p2pdma_setup(pdev);
- if (error)
- return error;
- }
+ error = pcim_p2pdma_init(pdev);
+ if (error)
+ return error;
+
+ error = pci_p2pdma_setup_pool(pdev);
+ if (error)
+ return error;
+
+ mem = pcim_p2pdma_provider(pdev, bar);
+ /*
+ * We checked the validity of the BAR prior to calling
+ * pcim_p2pdma_provider(), so it should never return NULL.
+ */
+ if (WARN_ON(!mem))
+ return -EINVAL;
p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
if (!p2p_pgmap)
@@ -328,10 +419,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
pgmap->ops = &p2pdma_pgmap_ops;
-
- p2p_pgmap->provider = pdev;
- p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
- pci_resource_start(pdev, bar);
+ p2p_pgmap->mem = mem;
addr = devm_memremap_pages(&pdev->dev, pgmap);
if (IS_ERR(addr)) {
@@ -340,7 +428,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
}
error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
- pdev);
+ p2p_pgmap);
if (error)
goto pages_free;
@@ -972,16 +1060,26 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
}
EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
-static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
- struct device *dev)
+/**
+ * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers
+ * @provider: P2PDMA provider structure
+ * @dev: Target device for the transfer
+ *
+ * Determines how peer-to-peer DMA transfers should be mapped between
+ * the provider and the target device. The mapping type indicates whether
+ * the transfer can be done directly through PCI switches or must go
+ * through the host bridge.
+ */
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider,
+ struct device *dev)
{
enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
- struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider;
+ struct pci_dev *pdev = to_pci_dev(provider->owner);
struct pci_dev *client;
struct pci_p2pdma *p2pdma;
int dist;
- if (!provider->p2pdma)
+ if (!pdev->p2pdma)
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
if (!dev_is_pci(dev))
@@ -990,7 +1088,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
client = to_pci_dev(dev);
rcu_read_lock();
- p2pdma = rcu_dereference(provider->p2pdma);
+ p2pdma = rcu_dereference(pdev->p2pdma);
if (p2pdma)
type = xa_to_value(xa_load(&p2pdma->map_types,
@@ -998,7 +1096,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
rcu_read_unlock();
if (type == PCI_P2PDMA_MAP_UNKNOWN)
- return calc_map_type_and_dist(provider, client, &dist, true);
+ return calc_map_type_and_dist(pdev, client, &dist, true);
return type;
}
@@ -1006,9 +1104,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
struct device *dev, struct page *page)
{
- state->pgmap = page_pgmap(page);
- state->map = pci_p2pdma_map_type(state->pgmap, dev);
- state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset;
+ struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page));
+
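+ /* Only recompute the mapping type when the provider changes. */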
+ if (state->mem == p2p_pgmap->mem)
+ return;
+
+ state->mem = p2p_pgmap->mem;
+ state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev);
}
/**
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 302d61783f6c..7c2d9d596258 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -629,6 +629,8 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state)
struct pci_dev *pci_dev = to_pci_dev(dev);
struct pci_driver *drv = pci_dev->driver;
+ pci_dev->state_saved = false;
+
if (drv && drv->suspend) {
pci_power_t prev = pci_dev->current_state;
int error;
@@ -1036,6 +1038,8 @@ static int pci_pm_freeze(struct device *dev)
if (!pm) {
pci_pm_default_suspend(pci_dev);
+ if (!pm_runtime_suspended(dev))
+ pci_dev->state_saved = false;
return 0;
}
@@ -1129,8 +1133,6 @@ static int pci_pm_thaw(struct device *dev)
pci_pm_reenable_device(pci_dev);
}
- pci_dev->state_saved = false;
-
return error;
}
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 3881359440b1..80a7c4fe6b03 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1587,7 +1587,7 @@ static ssize_t __resource_resize_show(struct device *dev, int n, char *buf)
pci_config_pm_runtime_get(pdev);
ret = sysfs_emit(buf, "%016llx\n",
- (u64)pci_rebar_get_possible_sizes(pdev, n));
+ pci_rebar_get_possible_sizes(pdev, n));
pci_config_pm_runtime_put(pdev);
@@ -1599,18 +1599,13 @@ static ssize_t __resource_resize_store(struct device *dev, int n,
{
struct pci_dev *pdev = to_pci_dev(dev);
struct pci_bus *bus = pdev->bus;
- struct resource *b_win, *res;
unsigned long size;
- int ret, i;
+ int ret;
u16 cmd;
if (kstrtoul(buf, 0, &size) < 0)
return -EINVAL;
- b_win = pbus_select_window(bus, pci_resource_n(pdev, n));
- if (!b_win)
- return -EINVAL;
-
device_lock(dev);
if (dev->driver || pci_num_vf(pdev)) {
ret = -EBUSY;
@@ -1632,15 +1627,7 @@ static ssize_t __resource_resize_store(struct device *dev, int n,
pci_remove_resource_files(pdev);
- pci_dev_for_each_resource(pdev, res, i) {
- if (i >= PCI_BRIDGE_RESOURCES)
- break;
-
- if (b_win == pbus_select_window(bus, res))
- pci_release_resource(pdev, i);
- }
-
- ret = pci_resize_resource(pdev, n, size);
+ ret = pci_resize_resource(pdev, n, size, 0);
pci_assign_unassigned_bus_resources(bus);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b14dd064006c..13dbb405dc31 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1823,41 +1823,12 @@ static void pci_restore_config_space(struct pci_dev *pdev)
}
}
-static void pci_restore_rebar_state(struct pci_dev *pdev)
-{
- unsigned int pos, nbars, i;
- u32 ctrl;
-
- pos = pdev->rebar_cap;
- if (!pos)
- return;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl);
-
- for (i = 0; i < nbars; i++, pos += 8) {
- struct resource *res;
- int bar_idx, size;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX;
- res = pci_resource_n(pdev, bar_idx);
- size = pci_rebar_bytes_to_size(resource_size(res));
- ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
- ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size);
- pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
- }
-}
-
/**
* pci_restore_state - Restore the saved state of a PCI device
* @dev: PCI device that we're dealing with
*/
void pci_restore_state(struct pci_dev *dev)
{
- if (!dev->state_saved)
- return;
-
pci_restore_pcie_state(dev);
pci_restore_pasid_state(dev);
pci_restore_pri_state(dev);
@@ -3687,125 +3658,6 @@ void pci_acs_init(struct pci_dev *dev)
pci_enable_acs(dev);
}
-void pci_rebar_init(struct pci_dev *pdev)
-{
- pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR);
-}
-
-/**
- * pci_rebar_find_pos - find position of resize ctrl reg for BAR
- * @pdev: PCI device
- * @bar: BAR to find
- *
- * Helper to find the position of the ctrl register for a BAR.
- * Returns -ENOTSUPP if resizable BARs are not supported at all.
- * Returns -ENOENT if no ctrl register for the BAR could be found.
- */
-static int pci_rebar_find_pos(struct pci_dev *pdev, int bar)
-{
- unsigned int pos, nbars, i;
- u32 ctrl;
-
- if (pci_resource_is_iov(bar)) {
- pos = pci_iov_vf_rebar_cap(pdev);
- bar = pci_resource_num_to_vf_bar(bar);
- } else {
- pos = pdev->rebar_cap;
- }
-
- if (!pos)
- return -ENOTSUPP;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl);
-
- for (i = 0; i < nbars; i++, pos += 8) {
- int bar_idx;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- bar_idx = FIELD_GET(PCI_REBAR_CTRL_BAR_IDX, ctrl);
- if (bar_idx == bar)
- return pos;
- }
-
- return -ENOENT;
-}
-
-/**
- * pci_rebar_get_possible_sizes - get possible sizes for BAR
- * @pdev: PCI device
- * @bar: BAR to query
- *
- * Get the possible sizes of a resizable BAR as bitmask defined in the spec
- * (bit 0=1MB, bit 31=128TB). Returns 0 if BAR isn't resizable.
- */
-u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
-{
- int pos;
- u32 cap;
-
- pos = pci_rebar_find_pos(pdev, bar);
- if (pos < 0)
- return 0;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap);
- cap = FIELD_GET(PCI_REBAR_CAP_SIZES, cap);
-
- /* Sapphire RX 5600 XT Pulse has an invalid cap dword for BAR 0 */
- if (pdev->vendor == PCI_VENDOR_ID_ATI && pdev->device == 0x731f &&
- bar == 0 && cap == 0x700)
- return 0x3f00;
-
- return cap;
-}
-EXPORT_SYMBOL(pci_rebar_get_possible_sizes);
-
-/**
- * pci_rebar_get_current_size - get the current size of a BAR
- * @pdev: PCI device
- * @bar: BAR to set size to
- *
- * Read the size of a BAR from the resizable BAR config.
- * Returns size if found or negative error code.
- */
-int pci_rebar_get_current_size(struct pci_dev *pdev, int bar)
-{
- int pos;
- u32 ctrl;
-
- pos = pci_rebar_find_pos(pdev, bar);
- if (pos < 0)
- return pos;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- return FIELD_GET(PCI_REBAR_CTRL_BAR_SIZE, ctrl);
-}
-
-/**
- * pci_rebar_set_size - set a new size for a BAR
- * @pdev: PCI device
- * @bar: BAR to set size to
- * @size: new size as defined in the spec (0=1MB, 31=128TB)
- *
- * Set the new size of a BAR as defined in the spec.
- * Returns zero if resizing was successful, error code otherwise.
- */
-int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size)
-{
- int pos;
- u32 ctrl;
-
- pos = pci_rebar_find_pos(pdev, bar);
- if (pos < 0)
- return pos;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
- ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size);
- pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
- return 0;
-}
-
/**
* pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port
* @dev: the PCI device
@@ -6656,9 +6508,31 @@ static void pci_no_domains(void)
#endif
}
+#ifdef CONFIG_PCI_DOMAINS
+static DEFINE_IDA(pci_domain_nr_dynamic_ida);
+
+/**
+ * pci_bus_find_emul_domain_nr() - allocate a PCI domain number per constraints
+ * @hint: desired domain, 0 if any ID in the range of @min to @max is acceptable
+ * @min: minimum allowable domain
+ * @max: maximum allowable domain, no IDs higher than INT_MAX will be returned
+ */
+int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max)
+{
+ return ida_alloc_range(&pci_domain_nr_dynamic_ida, max(hint, min), max,
+ GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(pci_bus_find_emul_domain_nr);
+
+void pci_bus_release_emul_domain_nr(int domain_nr)
+{
+ ida_free(&pci_domain_nr_dynamic_ida, domain_nr);
+}
+EXPORT_SYMBOL_GPL(pci_bus_release_emul_domain_nr);
+#endif
+
#ifdef CONFIG_PCI_DOMAINS_GENERIC
static DEFINE_IDA(pci_domain_nr_static_ida);
-static DEFINE_IDA(pci_domain_nr_dynamic_ida);
static void of_pci_reserve_static_domain_nr(void)
{
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 36f8c0985430..a33bc4e0bf34 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -421,8 +421,10 @@ enum pci_bar_type {
struct device *pci_get_host_bridge_device(struct pci_dev *dev);
void pci_put_host_bridge_device(struct device *dev);
+void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size);
+int pci_do_resource_release_and_resize(struct pci_dev *dev, int resno, int size,
+ int exclude_bars);
unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge);
-int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res);
int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
int pci_configure_extended_tags(struct pci_dev *dev, void *ign);
@@ -808,8 +810,7 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno);
resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno);
void pci_restore_iov_state(struct pci_dev *dev);
int pci_iov_bus_range(struct pci_bus *bus);
-void pci_iov_resource_set_size(struct pci_dev *dev, int resno,
- resource_size_t size);
+void pci_iov_resource_set_size(struct pci_dev *dev, int resno, int size);
bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev);
static inline u16 pci_iov_vf_rebar_cap(struct pci_dev *dev)
{
@@ -851,7 +852,7 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
return 0;
}
static inline void pci_iov_resource_set_size(struct pci_dev *dev, int resno,
- resource_size_t size) { }
+ int size) { }
static inline bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev)
{
return false;
@@ -1022,12 +1023,9 @@ static inline int acpi_get_rc_resources(struct device *dev, const char *hid,
#endif
void pci_rebar_init(struct pci_dev *pdev);
+void pci_restore_rebar_state(struct pci_dev *pdev);
int pci_rebar_get_current_size(struct pci_dev *pdev, int bar);
int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size);
-static inline u64 pci_rebar_size_to_bytes(int size)
-{
- return 1ULL << (size + 20);
-}
struct device_node;
diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c
index d1b68c18444f..38a41ccf79b9 100644
--- a/drivers/pci/pcie/portdrv.c
+++ b/drivers/pci/pcie/portdrv.c
@@ -760,7 +760,6 @@ static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
device_for_each_child(&dev->dev, &off, pcie_port_device_iter);
pci_restore_state(dev);
- pci_save_state(dev);
return PCI_ERS_RESULT_RECOVERED;
}
diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c
index 65e4b008be00..ed0f9691e7d1 100644
--- a/drivers/pci/pcie/ptm.c
+++ b/drivers/pci/pcie/ptm.c
@@ -81,6 +81,11 @@ void pci_ptm_init(struct pci_dev *dev)
dev->ptm_granularity = 0;
}
+ if (cap & PCI_PTM_CAP_RES)
+ dev->ptm_responder = 1;
+ if (cap & PCI_PTM_CAP_REQ)
+ dev->ptm_requester = 1;
+
if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM)
pci_enable_ptm(dev, NULL);
@@ -144,6 +149,24 @@ static int __pci_enable_ptm(struct pci_dev *dev)
return -EINVAL;
}
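+ /*
+ * Only enable PTM when the device's PTM capability matches its role
+ * in the hierarchy.
+ */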
+ switch (pci_pcie_type(dev)) {
+ case PCI_EXP_TYPE_ROOT_PORT:
+ if (!dev->ptm_root)
+ return -EINVAL;
+ break;
+ case PCI_EXP_TYPE_UPSTREAM:
+ if (!dev->ptm_responder)
+ return -EINVAL;
+ break;
+ case PCI_EXP_TYPE_ENDPOINT:
+ case PCI_EXP_TYPE_LEG_END:
+ if (!dev->ptm_requester)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl);
ctrl |= PCI_PTM_CTRL_ENABLE;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 9cd032dff31e..124d2d309c58 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -650,6 +650,11 @@ static void pci_release_host_bridge_dev(struct device *dev)
pci_free_resource_list(&bridge->windows);
pci_free_resource_list(&bridge->dma_ranges);
+
+ /* Host bridges only have domain_nr set in the emulation case */
+ if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET)
+ pci_bus_release_emul_domain_nr(bridge->domain_nr);
+
kfree(bridge);
}
@@ -1130,7 +1135,8 @@ unregister:
device_del(&bridge->dev);
free:
#ifdef CONFIG_PCI_DOMAINS_GENERIC
- pci_bus_release_domain_nr(parent, bus->domain_nr);
+ if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET)
+ pci_bus_release_domain_nr(parent, bus->domain_nr);
#endif
if (bus_registered)
put_device(&bus->dev);
@@ -2747,8 +2753,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
pci_reassigndev_resource_alignment(dev);
- dev->state_saved = false;
-
pci_init_capabilities(dev);
/*
@@ -3170,8 +3174,7 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus,
* bus number if there is room.
*/
if (bus->self && bus->self->is_hotplug_bridge) {
- used_buses = max_t(unsigned int, available_buses,
- pci_hotplug_bus_size - 1);
+ used_buses = max(available_buses, pci_hotplug_bus_size - 1);
if (max - start < used_buses) {
max = start + used_buses;
diff --git a/drivers/pci/pwrctrl/Kconfig b/drivers/pci/pwrctrl/Kconfig
index 6956c1854811..e0f999f299bb 100644
--- a/drivers/pci/pwrctrl/Kconfig
+++ b/drivers/pci/pwrctrl/Kconfig
@@ -22,6 +22,21 @@ config PCI_PWRCTRL_SLOT
PCI slots. The voltage regulators powering the rails of the PCI slots
are expected to be defined in the devicetree node of the PCI bridge.
+config PCI_PWRCTRL_TC9563
+ tristate "PCI Power Control driver for TC9563 PCIe switch"
+ select PCI_PWRCTRL
+ default m if ARCH_QCOM
+ depends on I2C
+ help
+ Say Y or M here to enable the PCI Power Control driver for the
+ TC9563 PCIe switch.
+
+ This driver enables power and configures the TC9563 PCIe switch
+ through I2C. The TC9563 has one upstream and three downstream
+ ports. An integrated Ethernet MAC is connected to one of the
+ downstream ports as an endpoint device; the other two downstream
+ ports are meant to connect to external devices.
+
# deprecated
config HAVE_PWRCTL
bool
diff --git a/drivers/pci/pwrctrl/Makefile b/drivers/pci/pwrctrl/Makefile
index a4e5808d7850..13b02282106c 100644
--- a/drivers/pci/pwrctrl/Makefile
+++ b/drivers/pci/pwrctrl/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_PCI_PWRCTRL_PWRSEQ) += pci-pwrctrl-pwrseq.o
obj-$(CONFIG_PCI_PWRCTRL_SLOT) += pci-pwrctrl-slot.o
pci-pwrctrl-slot-y := slot.o
+
+obj-$(CONFIG_PCI_PWRCTRL_TC9563) += pci-pwrctrl-tc9563.o
diff --git a/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c b/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c
new file mode 100644
index 000000000000..ec423432ac65
--- /dev/null
+++ b/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c
@@ -0,0 +1,648 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#include <linux/array_size.h>
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/gpio/consumer.h>
+#include <linux/i2c.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/pci-pwrctrl.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/unaligned.h>
+
+#include "../pci.h"
+
+#define TC9563_GPIO_CONFIG 0x801208
+#define TC9563_RESET_GPIO 0x801210
+
+#define TC9563_PORT_L0S_DELAY 0x82496c
+#define TC9563_PORT_L1_DELAY 0x824970
+
+#define TC9563_EMBEDDED_ETH_DELAY 0x8200d8
+#define TC9563_ETH_L1_DELAY_MASK GENMASK(27, 18)
+#define TC9563_ETH_L1_DELAY_VALUE(x) FIELD_PREP(TC9563_ETH_L1_DELAY_MASK, x)
+#define TC9563_ETH_L0S_DELAY_MASK GENMASK(17, 13)
+#define TC9563_ETH_L0S_DELAY_VALUE(x) FIELD_PREP(TC9563_ETH_L0S_DELAY_MASK, x)
+
+#define TC9563_NFTS_2_5_GT 0x824978
+#define TC9563_NFTS_5_GT 0x82497c
+
+#define TC9563_PORT_LANE_ACCESS_ENABLE 0x828000
+
+#define TC9563_PHY_RATE_CHANGE_OVERRIDE 0x828040
+#define TC9563_PHY_RATE_CHANGE 0x828050
+
+#define TC9563_TX_MARGIN 0x828234
+
+#define TC9563_DFE_ENABLE 0x828a04
+#define TC9563_DFE_EQ0_MODE 0x828a08
+#define TC9563_DFE_EQ1_MODE 0x828a0c
+#define TC9563_DFE_EQ2_MODE 0x828a14
+#define TC9563_DFE_PD_MASK 0x828254
+
+#define TC9563_PORT_SELECT 0x82c02c
+#define TC9563_PORT_ACCESS_ENABLE 0x82c030
+
+#define TC9563_POWER_CONTROL 0x82b09c
+#define TC9563_POWER_CONTROL_OVREN 0x82b2c8
+
+#define TC9563_GPIO_MASK 0xfffffff3
+#define TC9563_GPIO_DEASSERT_BITS 0xc /* Bits to set to deassert reset */
+
+#define TC9563_TX_MARGIN_MIN_UA 400000
+
+/*
+ * From TC9563 PORSYS rev 0.2, figure 1.1 POR boot sequence
+ * wait for 10ms for the internal osc frequency to stabilize.
+ */
+#define TC9563_OSC_STAB_DELAY_US (10 * USEC_PER_MSEC)
+
+#define TC9563_L0S_L1_DELAY_UNIT_NS 256 /* Each unit represents 256 nanoseconds */
+
+struct tc9563_pwrctrl_reg_setting {
+ unsigned int offset;
+ unsigned int val;
+};
+
+enum tc9563_pwrctrl_ports {
+ TC9563_USP,
+ TC9563_DSP1,
+ TC9563_DSP2,
+ TC9563_DSP3,
+ TC9563_ETHERNET,
+ TC9563_MAX
+};
+
+struct tc9563_pwrctrl_cfg {
+ u32 l0s_delay;
+ u32 l1_delay;
+ u32 tx_amp;
+ u8 nfts[2]; /* GEN1 & GEN2 */
+ bool disable_dfe;
+ bool disable_port;
+};
+
+#define TC9563_PWRCTL_MAX_SUPPLY 6
+
+static const char *const tc9563_supply_names[TC9563_PWRCTL_MAX_SUPPLY] = {
+ "vddc",
+ "vdd18",
+ "vdd09",
+ "vddio1",
+ "vddio2",
+ "vddio18",
+};
+
+struct tc9563_pwrctrl_ctx {
+ struct regulator_bulk_data supplies[TC9563_PWRCTL_MAX_SUPPLY];
+ struct tc9563_pwrctrl_cfg cfg[TC9563_MAX];
+ struct gpio_desc *reset_gpio;
+ struct i2c_adapter *adapter;
+ struct i2c_client *client;
+ struct pci_pwrctrl pwrctrl;
+};
+
+/*
+ * Downstream port power-off sequence. The addresses are hardcoded
+ * because the register names for these offsets are not documented.
+ */
+static const struct tc9563_pwrctrl_reg_setting common_pwroff_seq[] = {
+ {0x82900c, 0x1},
+ {0x829010, 0x1},
+ {0x829018, 0x0},
+ {0x829020, 0x1},
+ {0x82902c, 0x1},
+ {0x829030, 0x1},
+ {0x82903c, 0x1},
+ {0x829058, 0x0},
+ {0x82905c, 0x1},
+ {0x829060, 0x1},
+ {0x8290cc, 0x1},
+ {0x8290d0, 0x1},
+ {0x8290d8, 0x1},
+ {0x8290e0, 0x1},
+ {0x8290e8, 0x1},
+ {0x8290ec, 0x1},
+ {0x8290f4, 0x1},
+ {0x82910c, 0x1},
+ {0x829110, 0x1},
+ {0x829114, 0x1},
+};
+
+static const struct tc9563_pwrctrl_reg_setting dsp1_pwroff_seq[] = {
+ {TC9563_PORT_ACCESS_ENABLE, 0x2},
+ {TC9563_PORT_LANE_ACCESS_ENABLE, 0x3},
+ {TC9563_POWER_CONTROL, 0x014f4804},
+ {TC9563_POWER_CONTROL_OVREN, 0x1},
+ {TC9563_PORT_ACCESS_ENABLE, 0x4},
+};
+
+static const struct tc9563_pwrctrl_reg_setting dsp2_pwroff_seq[] = {
+ {TC9563_PORT_ACCESS_ENABLE, 0x8},
+ {TC9563_PORT_LANE_ACCESS_ENABLE, 0x1},
+ {TC9563_POWER_CONTROL, 0x014f4804},
+ {TC9563_POWER_CONTROL_OVREN, 0x1},
+ {TC9563_PORT_ACCESS_ENABLE, 0x8},
+};
+
+/*
+ * All transfers are initiated from probe, so there are no concurrent
+ * callers and no locking is needed.
+ */
+static int tc9563_pwrctrl_i2c_write(struct i2c_client *client,
+ u32 reg_addr, u32 reg_val)
+{
+ struct i2c_msg msg;
+ u8 msg_buf[7];
+ int ret;
+
+ msg.addr = client->addr;
+ msg.len = 7;
+ msg.flags = 0;
+
+ /* Big Endian for reg addr */
+ put_unaligned_be24(reg_addr, &msg_buf[0]);
+
+ /* Little Endian for reg val */
+ put_unaligned_le32(reg_val, &msg_buf[3]);
+
+ msg.buf = msg_buf;
+ ret = i2c_transfer(client->adapter, &msg, 1);
+ return ret == 1 ? 0 : ret;
+}
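
For illustration, a minimal sketch of the 7-byte message this helper puts
on the wire (register offset and value are invented for the example): the
24-bit register address goes out big-endian, followed by the 32-bit value
little-endian.

    /* hypothetical write of 0x10 to TC9563_PORT_L0S_DELAY (0x82496c) */
    u8 buf[7];

    put_unaligned_be24(0x82496c, &buf[0]); /* buf[0..2] = 82 49 6c */
    put_unaligned_le32(0x10, &buf[3]);     /* buf[3..6] = 10 00 00 00 */
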
+
+static int tc9563_pwrctrl_i2c_read(struct i2c_client *client,
+ u32 reg_addr, u32 *reg_val)
+{
+ struct i2c_msg msg[2];
+ u8 wr_data[3];
+ u32 rd_data;
+ int ret;
+
+ msg[0].addr = client->addr;
+ msg[0].len = 3;
+ msg[0].flags = 0;
+
+ /* Big Endian for reg addr */
+ put_unaligned_be24(reg_addr, &wr_data[0]);
+
+ msg[0].buf = wr_data;
+
+ msg[1].addr = client->addr;
+ msg[1].len = 4;
+ msg[1].flags = I2C_M_RD;
+
+ msg[1].buf = (u8 *)&rd_data;
+
+ ret = i2c_transfer(client->adapter, &msg[0], 2);
+ if (ret == 2) {
+ *reg_val = get_unaligned_le32(&rd_data);
+ return 0;
+ }
+
+ /* If only one message successfully completed, return -EIO */
+ return ret == 1 ? -EIO : ret;
+}
+
+static int tc9563_pwrctrl_i2c_bulk_write(struct i2c_client *client,
+ const struct tc9563_pwrctrl_reg_setting *seq, int len)
+{
+ int ret, i;
+
+ for (i = 0; i < len; i++) {
+ ret = tc9563_pwrctrl_i2c_write(client, seq[i].offset, seq[i].val);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int tc9563_pwrctrl_disable_port(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port)
+{
+ struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port];
+ const struct tc9563_pwrctrl_reg_setting *seq;
+ int ret, len;
+
+ if (!cfg->disable_port)
+ return 0;
+
+ if (port == TC9563_DSP1) {
+ seq = dsp1_pwroff_seq;
+ len = ARRAY_SIZE(dsp1_pwroff_seq);
+ } else {
+ seq = dsp2_pwroff_seq;
+ len = ARRAY_SIZE(dsp2_pwroff_seq);
+ }
+
+ ret = tc9563_pwrctrl_i2c_bulk_write(ctx->client, seq, len);
+ if (ret)
+ return ret;
+
+ return tc9563_pwrctrl_i2c_bulk_write(ctx->client,
+ common_pwroff_seq, ARRAY_SIZE(common_pwroff_seq));
+}
+
+static int tc9563_pwrctrl_set_l0s_l1_entry_delay(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port, bool is_l1, u32 ns)
+{
+ u32 rd_val, units;
+ int ret;
+
+ if (ns < TC9563_L0S_L1_DELAY_UNIT_NS)
+ return 0;
+
+ /* convert to units of 256ns */
+ units = ns / TC9563_L0S_L1_DELAY_UNIT_NS;
+
+ if (port == TC9563_ETHERNET) {
+ ret = tc9563_pwrctrl_i2c_read(ctx->client, TC9563_EMBEDDED_ETH_DELAY, &rd_val);
+ if (ret)
+ return ret;
+
+ if (is_l1)
+ rd_val = u32_replace_bits(rd_val, units, TC9563_ETH_L1_DELAY_MASK);
+ else
+ rd_val = u32_replace_bits(rd_val, units, TC9563_ETH_L0S_DELAY_MASK);
+
+ return tc9563_pwrctrl_i2c_write(ctx->client, TC9563_EMBEDDED_ETH_DELAY, rd_val);
+ }
+
+ ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_PORT_SELECT, BIT(port));
+ if (ret)
+ return ret;
+
+ return tc9563_pwrctrl_i2c_write(ctx->client,
+ is_l1 ? TC9563_PORT_L1_DELAY : TC9563_PORT_L0S_DELAY, units);
+}
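
As a worked example of the 256 ns granularity (the requested delay is
invented): a 4096 ns entry delay maps to 4096 / 256 = 16 units, while any
request below 256 ns falls through the early return and leaves the
hardware default untouched.

    u32 units = 4096 / TC9563_L0S_L1_DELAY_UNIT_NS; /* 16 units of 256 ns */
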
+
+static int tc9563_pwrctrl_set_tx_amplitude(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port)
+{
+ u32 amp = ctx->cfg[port].tx_amp;
+ int port_access;
+
+ if (amp < TC9563_TX_MARGIN_MIN_UA)
+ return 0;
+
+ /* txmargin = (Amp(uV) - 400000) / 3125 */
+ amp = (amp - TC9563_TX_MARGIN_MIN_UA) / 3125;
+
+ switch (port) {
+ case TC9563_USP:
+ port_access = 0x1;
+ break;
+ case TC9563_DSP1:
+ port_access = 0x2;
+ break;
+ case TC9563_DSP2:
+ port_access = 0x8;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ struct tc9563_pwrctrl_reg_setting tx_amp_seq[] = {
+ {TC9563_PORT_ACCESS_ENABLE, port_access},
+ {TC9563_PORT_LANE_ACCESS_ENABLE, 0x3},
+ {TC9563_TX_MARGIN, amp},
+ };
+
+ return tc9563_pwrctrl_i2c_bulk_write(ctx->client, tx_amp_seq, ARRAY_SIZE(tx_amp_seq));
+}
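
A worked instance of the txmargin formula above, with an illustrative
amplitude: 800000 uV gives (800000 - 400000) / 3125 = 128, i.e. 128 steps
of 3.125 mV above the 400 mV floor.

    amp = (800000 - TC9563_TX_MARGIN_MIN_UA) / 3125; /* = 128 */
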
+
+static int tc9563_pwrctrl_disable_dfe(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port)
+{
+ struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port];
+ int port_access, lane_access = 0x3;
+ u32 phy_rate = 0x21;
+
+ if (!cfg->disable_dfe)
+ return 0;
+
+ switch (port) {
+ case TC9563_USP:
+ phy_rate = 0x1;
+ port_access = 0x1;
+ break;
+ case TC9563_DSP1:
+ port_access = 0x2;
+ break;
+ case TC9563_DSP2:
+ port_access = 0x8;
+ lane_access = 0x1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ struct tc9563_pwrctrl_reg_setting disable_dfe_seq[] = {
+ {TC9563_PORT_ACCESS_ENABLE, port_access},
+ {TC9563_PORT_LANE_ACCESS_ENABLE, lane_access},
+ {TC9563_DFE_ENABLE, 0x0},
+ {TC9563_DFE_EQ0_MODE, 0x411},
+ {TC9563_DFE_EQ1_MODE, 0x11},
+ {TC9563_DFE_EQ2_MODE, 0x11},
+ {TC9563_DFE_PD_MASK, 0x7},
+ {TC9563_PHY_RATE_CHANGE_OVERRIDE, 0x10},
+ {TC9563_PHY_RATE_CHANGE, phy_rate},
+ {TC9563_PHY_RATE_CHANGE, 0x0},
+ {TC9563_PHY_RATE_CHANGE_OVERRIDE, 0x0},
+ };
+
+ return tc9563_pwrctrl_i2c_bulk_write(ctx->client,
+ disable_dfe_seq, ARRAY_SIZE(disable_dfe_seq));
+}
+
+static int tc9563_pwrctrl_set_nfts(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port)
+{
+ u8 *nfts = ctx->cfg[port].nfts;
+ struct tc9563_pwrctrl_reg_setting nfts_seq[] = {
+ {TC9563_NFTS_2_5_GT, nfts[0]},
+ {TC9563_NFTS_5_GT, nfts[1]},
+ };
+ int ret;
+
+ if (!nfts[0])
+ return 0;
+
+ ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_PORT_SELECT, BIT(port));
+ if (ret)
+ return ret;
+
+ return tc9563_pwrctrl_i2c_bulk_write(ctx->client, nfts_seq, ARRAY_SIZE(nfts_seq));
+}
+
+static int tc9563_pwrctrl_assert_deassert_reset(struct tc9563_pwrctrl_ctx *ctx, bool deassert)
+{
+ int ret, val;
+
+ ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_GPIO_CONFIG, TC9563_GPIO_MASK);
+ if (ret)
+ return ret;
+
+ val = deassert ? TC9563_GPIO_DEASSERT_BITS : 0;
+
+ return tc9563_pwrctrl_i2c_write(ctx->client, TC9563_RESET_GPIO, val);
+}
+
+static int tc9563_pwrctrl_parse_device_dt(struct tc9563_pwrctrl_ctx *ctx, struct device_node *node,
+ enum tc9563_pwrctrl_ports port)
+{
+ struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port];
+ int ret;
+
+ /* Disable the port if its devicetree status is "disabled". */
+ if (!of_device_is_available(node)) {
+ cfg->disable_port = true;
+ return 0;
+ }
+
+ ret = of_property_read_u32(node, "aspm-l0s-entry-delay-ns", &cfg->l0s_delay);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ ret = of_property_read_u32(node, "aspm-l1-entry-delay-ns", &cfg->l1_delay);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ ret = of_property_read_u32(node, "toshiba,tx-amplitude-microvolt", &cfg->tx_amp);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ ret = of_property_read_u8_array(node, "n-fts", cfg->nfts, ARRAY_SIZE(cfg->nfts));
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ cfg->disable_dfe = of_property_read_bool(node, "toshiba,no-dfe-support");
+
+ return 0;
+}
+
+static void tc9563_pwrctrl_power_off(struct tc9563_pwrctrl_ctx *ctx)
+{
+ gpiod_set_value(ctx->reset_gpio, 1);
+
+ regulator_bulk_disable(ARRAY_SIZE(ctx->supplies), ctx->supplies);
+}
+
+static int tc9563_pwrctrl_bring_up(struct tc9563_pwrctrl_ctx *ctx)
+{
+ struct tc9563_pwrctrl_cfg *cfg;
+ int ret, i;
+
+ ret = regulator_bulk_enable(ARRAY_SIZE(ctx->supplies), ctx->supplies);
+ if (ret < 0)
+ return dev_err_probe(ctx->pwrctrl.dev, ret, "cannot enable regulators\n");
+
+ gpiod_set_value(ctx->reset_gpio, 0);
+
+ fsleep(TC9563_OSC_STAB_DELAY_US);
+
+ ret = tc9563_pwrctrl_assert_deassert_reset(ctx, false);
+ if (ret)
+ goto power_off;
+
+ for (i = 0; i < TC9563_MAX; i++) {
+ cfg = &ctx->cfg[i];
+ ret = tc9563_pwrctrl_disable_port(ctx, i);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Disabling port failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_set_l0s_l1_entry_delay(ctx, i, false, cfg->l0s_delay);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Setting L0s entry delay failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_set_l0s_l1_entry_delay(ctx, i, true, cfg->l1_delay);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Setting L1 entry delay failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_set_tx_amplitude(ctx, i);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Setting Tx amplitude failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_set_nfts(ctx, i);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Setting N_FTS failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_disable_dfe(ctx, i);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Disabling DFE failed\n");
+ goto power_off;
+ }
+ }
+
+ ret = tc9563_pwrctrl_assert_deassert_reset(ctx, true);
+ if (!ret)
+ return 0;
+
+power_off:
+ tc9563_pwrctrl_power_off(ctx);
+ return ret;
+}
+
+static int tc9563_pwrctrl_probe(struct platform_device *pdev)
+{
+ struct pci_host_bridge *bridge = to_pci_host_bridge(pdev->dev.parent);
+ struct pci_bus *bus = bridge->bus;
+ struct device *dev = &pdev->dev;
+ enum tc9563_pwrctrl_ports port;
+ struct tc9563_pwrctrl_ctx *ctx;
+ struct device_node *i2c_node;
+ int ret, addr;
+
+ ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ret = of_property_read_u32_index(pdev->dev.of_node, "i2c-parent", 1, &addr);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to read i2c-parent property\n");
+
+ i2c_node = of_parse_phandle(dev->of_node, "i2c-parent", 0);
+ ctx->adapter = of_find_i2c_adapter_by_node(i2c_node);
+ of_node_put(i2c_node);
+ if (!ctx->adapter)
+ return dev_err_probe(dev, -EPROBE_DEFER, "Failed to find I2C adapter\n");
+
+ ctx->client = i2c_new_dummy_device(ctx->adapter, addr);
+ if (IS_ERR(ctx->client)) {
+ dev_err(dev, "Failed to create I2C client\n");
+ i2c_put_adapter(ctx->adapter);
+ return PTR_ERR(ctx->client);
+ }
+
+ for (int i = 0; i < ARRAY_SIZE(tc9563_supply_names); i++)
+ ctx->supplies[i].supply = tc9563_supply_names[i];
+
+ ret = devm_regulator_bulk_get(dev, TC9563_PWRCTL_MAX_SUPPLY, ctx->supplies);
+ if (ret) {
+ dev_err_probe(dev, ret, "failed to get supply regulator\n");
+ goto remove_i2c;
+ }
+
+ ctx->reset_gpio = devm_gpiod_get(dev, "resx", GPIOD_OUT_HIGH);
+ if (IS_ERR(ctx->reset_gpio)) {
+ ret = dev_err_probe(dev, PTR_ERR(ctx->reset_gpio), "failed to get resx GPIO\n");
+ goto remove_i2c;
+ }
+
+ pci_pwrctrl_init(&ctx->pwrctrl, dev);
+
+ port = TC9563_USP;
+ ret = tc9563_pwrctrl_parse_device_dt(ctx, pdev->dev.of_node, port);
+ if (ret) {
+ dev_err(dev, "failed to parse device tree properties: %d\n", ret);
+ goto remove_i2c;
+ }
+
+ /*
+ * Downstream ports are always children of the upstream port.
+ * The first node represents DSP1, the second node represents DSP2, and so on.
+ */
+ for_each_child_of_node_scoped(pdev->dev.of_node, child) {
+ port++;
+ ret = tc9563_pwrctrl_parse_device_dt(ctx, child, port);
+ if (ret)
+ break;
+ /* The embedded ethernet device is under DSP3 */
+ if (port == TC9563_DSP3) {
+ for_each_child_of_node_scoped(child, child1) {
+ port++;
+ ret = tc9563_pwrctrl_parse_device_dt(ctx, child1, port);
+ if (ret)
+ break;
+ }
+ }
+ }
+ if (ret) {
+ dev_err(dev, "failed to parse device tree properties: %d\n", ret);
+ goto remove_i2c;
+ }
+
+ if (bridge->ops->assert_perst) {
+ ret = bridge->ops->assert_perst(bus, true);
+ if (ret)
+ goto remove_i2c;
+ }
+
+ ret = tc9563_pwrctrl_bring_up(ctx);
+ if (ret)
+ goto remove_i2c;
+
+ if (bridge->ops->assert_perst) {
+ ret = bridge->ops->assert_perst(bus, false);
+ if (ret)
+ goto power_off;
+ }
+
+ ret = devm_pci_pwrctrl_device_set_ready(dev, &ctx->pwrctrl);
+ if (ret)
+ goto power_off;
+
+ platform_set_drvdata(pdev, ctx);
+
+ return 0;
+
+power_off:
+ tc9563_pwrctrl_power_off(ctx);
+remove_i2c:
+ i2c_unregister_device(ctx->client);
+ i2c_put_adapter(ctx->adapter);
+ return ret;
+}
+
+static void tc9563_pwrctrl_remove(struct platform_device *pdev)
+{
+ struct tc9563_pwrctrl_ctx *ctx = platform_get_drvdata(pdev);
+
+ tc9563_pwrctrl_power_off(ctx);
+ i2c_unregister_device(ctx->client);
+ i2c_put_adapter(ctx->adapter);
+}
+
+static const struct of_device_id tc9563_pwrctrl_of_match[] = {
+ { .compatible = "pci1179,0623"},
+ { }
+};
+MODULE_DEVICE_TABLE(of, tc9563_pwrctrl_of_match);
+
+static struct platform_driver tc9563_pwrctrl_driver = {
+ .driver = {
+ .name = "pwrctrl-tc9563",
+ .of_match_table = tc9563_pwrctrl_of_match,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+ .probe = tc9563_pwrctrl_probe,
+ .remove = tc9563_pwrctrl_remove,
+};
+module_platform_driver(tc9563_pwrctrl_driver);
+
+MODULE_AUTHOR("Krishna chaitanya chundru <quic_krichai@quicinc.com>");
+MODULE_DESCRIPTION("TC956x power control driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c
new file mode 100644
index 000000000000..ecdebdeb2dff
--- /dev/null
+++ b/drivers/pci/rebar.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Resizable BAR Extended Capability handling.
+ */
+
+#include <linux/bits.h>
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/ioport.h>
+#include <linux/log2.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include "pci.h"
+
+#define PCI_REBAR_MIN_SIZE ((resource_size_t)SZ_1M)
+
+/**
+ * pci_rebar_bytes_to_size - Convert size in bytes to PCI BAR Size
+ * @bytes: size in bytes
+ *
+ * Convert size in bytes to encoded BAR Size in Resizable BAR Capability
+ * (PCIe r6.2, sec. 7.8.6.3).
+ *
+ * Return: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB)
+ */
+int pci_rebar_bytes_to_size(u64 bytes)
+{
+ int rebar_minsize = ilog2(PCI_REBAR_MIN_SIZE);
+
+ bytes = roundup_pow_of_two(bytes);
+
+ return max(ilog2(bytes), rebar_minsize) - rebar_minsize;
+}
+EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size);
+
+/**
+ * pci_rebar_size_to_bytes - Convert encoded BAR Size to size in bytes
+ * @size: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB)
+ *
+ * Return: BAR size in bytes
+ */
+resource_size_t pci_rebar_size_to_bytes(int size)
+{
+ return 1ULL << (size + ilog2(PCI_REBAR_MIN_SIZE));
+}
+EXPORT_SYMBOL_GPL(pci_rebar_size_to_bytes);
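
A quick round-trip through the two helpers as a sanity check of the
encoding (SZ_256M is just an example size): 256 MB is 2^28 bytes, so the
encoded BAR Size is 28 - 20 = 8, and decoding 8 gives 1 << (8 + 20) bytes,
i.e. 256 MB again.

    int size = pci_rebar_bytes_to_size(SZ_256M);           /* 28 - 20 = 8 */
    resource_size_t bytes = pci_rebar_size_to_bytes(size); /* SZ_256M */
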
+
+void pci_rebar_init(struct pci_dev *pdev)
+{
+ pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR);
+}
+
+/**
+ * pci_rebar_find_pos - find position of resize control reg for BAR
+ * @pdev: PCI device
+ * @bar: BAR to find
+ *
+ * Helper to find the position of the control register for a BAR.
+ *
+ * Return:
+ * * %-ENOTSUPP if resizable BARs are not supported at all,
+ * * %-ENOENT if no control register for the BAR could be found.
+ */
+static int pci_rebar_find_pos(struct pci_dev *pdev, int bar)
+{
+ unsigned int pos, nbars, i;
+ u32 ctrl;
+
+ if (pci_resource_is_iov(bar)) {
+ pos = pci_iov_vf_rebar_cap(pdev);
+ bar = pci_resource_num_to_vf_bar(bar);
+ } else {
+ pos = pdev->rebar_cap;
+ }
+
+ if (!pos)
+ return -ENOTSUPP;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl);
+
+ for (i = 0; i < nbars; i++, pos += 8) {
+ int bar_idx;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ bar_idx = FIELD_GET(PCI_REBAR_CTRL_BAR_IDX, ctrl);
+ if (bar_idx == bar)
+ return pos;
+ }
+
+ return -ENOENT;
+}
+
+/**
+ * pci_rebar_get_possible_sizes - get possible sizes for Resizable BAR
+ * @pdev: PCI device
+ * @bar: BAR to query
+ *
+ * Get the possible sizes of a resizable BAR as bitmask.
+ *
+ * Return: A bitmask of possible sizes (bit 0=1MB, bit 31=128TB), or %0 if
+ * the BAR isn't resizable.
+ */
+u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
+{
+ int pos;
+ u32 cap;
+
+ pos = pci_rebar_find_pos(pdev, bar);
+ if (pos < 0)
+ return 0;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap);
+ cap = FIELD_GET(PCI_REBAR_CAP_SIZES, cap);
+
+ /* Sapphire RX 5600 XT Pulse has an invalid cap dword for BAR 0 */
+ if (pdev->vendor == PCI_VENDOR_ID_ATI && pdev->device == 0x731f &&
+ bar == 0 && cap == 0x700)
+ return 0x3f00;
+
+ return cap;
+}
+EXPORT_SYMBOL(pci_rebar_get_possible_sizes);
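
To read the returned bitmask: bit n set means a BAR size of 2^(n + 20)
bytes is supported. For the quirked value above, 0x3f00 has bits 8-13 set,
i.e. 256 MB, 512 MB, 1 GB, 2 GB, 4 GB and 8 GB.

    u64 sizes = pci_rebar_get_possible_sizes(pdev, 0);
    bool has_256mb = sizes & BIT(8); /* bit 8 -> 2^(8 + 20) = 256 MB */
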
+
+/**
+ * pci_rebar_size_supported - check if size is supported for BAR
+ * @pdev: PCI device
+ * @bar: BAR to check
+ * @size: encoded size as defined in the PCIe spec (0=1MB, 31=128TB)
+ *
+ * Return: %true if @bar is resizable and @size is supported, otherwise
+ * %false.
+ */
+bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size)
+{
+ u64 sizes = pci_rebar_get_possible_sizes(pdev, bar);
+
+ if (size < 0 || size > ilog2(SZ_128T) - ilog2(PCI_REBAR_MIN_SIZE))
+ return false;
+
+ return BIT(size) & sizes;
+}
+EXPORT_SYMBOL_GPL(pci_rebar_size_supported);
+
+/**
+ * pci_rebar_get_max_size - get the maximum supported size of a BAR
+ * @pdev: PCI device
+ * @bar: BAR to query
+ *
+ * Get the largest supported size of a resizable BAR as a size.
+ *
+ * Return: the encoded maximum BAR size as defined in the PCIe spec
+ * (0=1MB, 31=128TB), or %-ENOENT on error.
+ */
+int pci_rebar_get_max_size(struct pci_dev *pdev, int bar)
+{
+ u64 sizes;
+
+ sizes = pci_rebar_get_possible_sizes(pdev, bar);
+ if (!sizes)
+ return -ENOENT;
+
+ return __fls(sizes);
+}
+EXPORT_SYMBOL_GPL(pci_rebar_get_max_size);
+
+/**
+ * pci_rebar_get_current_size - get the current size of a Resizable BAR
+ * @pdev: PCI device
+ * @bar: BAR to get the size from
+ *
+ * Read the current size of a BAR from the Resizable BAR config.
+ *
+ * Return: BAR Size if @bar is resizable (0=1MB, 31=128TB), or negative on
+ * error.
+ */
+int pci_rebar_get_current_size(struct pci_dev *pdev, int bar)
+{
+ int pos;
+ u32 ctrl;
+
+ pos = pci_rebar_find_pos(pdev, bar);
+ if (pos < 0)
+ return pos;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ return FIELD_GET(PCI_REBAR_CTRL_BAR_SIZE, ctrl);
+}
+
+/**
+ * pci_rebar_set_size - set a new size for a Resizable BAR
+ * @pdev: PCI device
+ * @bar: BAR to set size to
+ * @size: new size as defined in the PCIe spec (0=1MB, 31=128TB)
+ *
+ * Set the new size of a BAR as defined in the spec.
+ *
+ * Return: %0 if resizing was successful, or negative on error.
+ */
+int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size)
+{
+ int pos;
+ u32 ctrl;
+
+ pos = pci_rebar_find_pos(pdev, bar);
+ if (pos < 0)
+ return pos;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
+ ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size);
+ pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
+
+ if (pci_resource_is_iov(bar))
+ pci_iov_resource_set_size(pdev, bar, size);
+
+ return 0;
+}
+
+void pci_restore_rebar_state(struct pci_dev *pdev)
+{
+ unsigned int pos, nbars, i;
+ u32 ctrl;
+
+ pos = pdev->rebar_cap;
+ if (!pos)
+ return;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl);
+
+ for (i = 0; i < nbars; i++, pos += 8) {
+ struct resource *res;
+ int bar_idx, size;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX;
+ res = pci_resource_n(pdev, bar_idx);
+ size = pci_rebar_bytes_to_size(resource_size(res));
+ ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
+ ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size);
+ pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
+ }
+}
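
For example (the resource size is invented): a BAR whose resource spans
512 MB yields pci_rebar_bytes_to_size(SZ_512M) = 9, and that value is
programmed back into PCI_REBAR_CTRL_BAR_SIZE so the device again decodes
the window the kernel had assigned before the reset.

    size = pci_rebar_bytes_to_size(SZ_512M); /* 29 - 20 = 9 */
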
+
+static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev,
+ int resno)
+{
+ u16 cmd;
+
+ if (pci_resource_is_iov(resno))
+ return pci_iov_is_memory_decoding_enabled(dev);
+
+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
+
+ return cmd & PCI_COMMAND_MEMORY;
+}
+
+void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size)
+{
+ resource_size_t res_size = pci_rebar_size_to_bytes(size);
+ struct resource *res = pci_resource_n(dev, resno);
+
+ if (pci_resource_is_iov(resno))
+ res_size *= pci_sriov_get_totalvfs(dev);
+
+ resource_set_size(res, res_size);
+}
+
+/**
+ * pci_resize_resource - reconfigure a Resizable BAR and resources
+ * @dev: the PCI device
+ * @resno: index of the BAR to be resized
+ * @size: new size as defined in the spec (0=1MB, 31=128TB)
+ * @exclude_bars: a mask of BARs that should not be released
+ *
+ * Reconfigure @resno to @size and re-run resource assignment algorithm
+ * with the new size.
+ *
+ * Prior to resize, release @dev resources that share a bridge window with
+ * @resno. This unpins the bridge window resource to allow changing it.
+ *
+ * The caller may prevent releasing a particular BAR by providing
+ * @exclude_bars mask, but this may result in the resize operation failing
+ * due to insufficient space.
+ *
+ * Return: 0 on success, or negative on error. In case of an error, the
+ * resources are restored to their original places.
+ */
+int pci_resize_resource(struct pci_dev *dev, int resno, int size,
+ int exclude_bars)
+{
+ struct pci_host_bridge *host;
+ int old, ret;
+
+ /* Check if we must preserve the firmware's resource assignment */
+ host = pci_find_host_bridge(dev->bus);
+ if (host->preserve_config)
+ return -ENOTSUPP;
+
+ if (pci_resize_is_memory_decoding_enabled(dev, resno))
+ return -EBUSY;
+
+ if (!pci_rebar_size_supported(dev, resno, size))
+ return -EINVAL;
+
+ old = pci_rebar_get_current_size(dev, resno);
+ if (old < 0)
+ return old;
+
+ ret = pci_rebar_set_size(dev, resno, size);
+ if (ret)
+ return ret;
+
+ ret = pci_do_resource_release_and_resize(dev, resno, size, exclude_bars);
+ if (ret)
+ goto error_resize;
+ return 0;
+
+error_resize:
+ pci_rebar_set_size(dev, resno, old);
+ return ret;
+}
+EXPORT_SYMBOL(pci_resize_resource);
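
A minimal caller-side sketch under the new signature (the device, BAR
index and target size are assumptions for illustration): a driver that
wants a 1 GB BAR 0 and excludes no BARs from the release pass could do:

    int size = pci_rebar_bytes_to_size(SZ_1G); /* encoded size 10 */
    int ret = pci_resize_resource(pdev, 0, size, 0);

    if (ret)
        dev_warn(&pdev->dev, "BAR 0 resize failed: %d\n", ret);
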
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 3645f392a9fd..6e90f46f52af 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -15,6 +15,7 @@
*/
#include <linux/bitops.h>
+#include <linux/bug.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -135,6 +136,9 @@ static void restore_dev_resource(struct pci_dev_resource *dev_res)
{
struct resource *res = dev_res->res;
+ if (WARN_ON_ONCE(res->parent))
+ return;
+
res->start = dev_res->start;
res->end = dev_res->end;
res->flags = dev_res->flags;
@@ -2420,18 +2424,16 @@ EXPORT_SYMBOL_GPL(pci_assign_unassigned_bridge_resources);
* release it when possible. If the bridge window contains assigned
* resources, it cannot be released.
*/
-int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
+static int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res,
+ struct list_head *saved)
{
unsigned long type = res->flags;
struct pci_dev_resource *dev_res;
- struct pci_dev *bridge;
- LIST_HEAD(saved);
+ struct pci_dev *bridge = NULL;
LIST_HEAD(added);
LIST_HEAD(failed);
unsigned int i;
- int ret;
-
- down_read(&pci_bus_sem);
+ int ret = 0;
while (!pci_is_root_bus(bus)) {
bridge = bus->self;
@@ -2443,9 +2445,9 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
/* Ignore BARs which are still in use */
if (!res->child) {
- ret = add_to_list(&saved, bridge, res, 0, 0);
+ ret = add_to_list(saved, bridge, res, 0, 0);
if (ret)
- goto cleanup;
+ return ret;
pci_release_resource(bridge, i);
} else {
@@ -2459,10 +2461,8 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
bus = bus->parent;
}
- if (list_empty(&saved)) {
- up_read(&pci_bus_sem);
+ if (!bridge)
return -ENOENT;
- }
__pci_bus_size_bridges(bridge->subordinate, &added);
__pci_bridge_assign_resources(bridge, &added, &failed);
@@ -2470,49 +2470,107 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
free_list(&added);
if (!list_empty(&failed)) {
- if (pci_required_resource_failed(&failed, type)) {
+ if (pci_required_resource_failed(&failed, type))
ret = -ENOSPC;
- goto cleanup;
- }
- /* Only resources with unrelated types failed (again) */
free_list(&failed);
+ if (ret)
+ return ret;
+
+ /* Only resources with unrelated types failed (again) */
}
- list_for_each_entry(dev_res, &saved, list) {
+ list_for_each_entry(dev_res, saved, list) {
+ struct pci_dev *dev = dev_res->dev;
+
/* Skip the bridge we just assigned resources for */
- if (bridge == dev_res->dev)
+ if (bridge == dev)
+ continue;
+
+ if (!dev->subordinate)
continue;
- bridge = dev_res->dev;
- pci_setup_bridge(bridge->subordinate);
+ pci_setup_bridge(dev->subordinate);
}
- free_list(&saved);
- up_read(&pci_bus_sem);
return 0;
+}
-cleanup:
- /* Restore size and flags */
- list_for_each_entry(dev_res, &failed, list)
- restore_dev_resource(dev_res);
- free_list(&failed);
+int pci_do_resource_release_and_resize(struct pci_dev *pdev, int resno, int size,
+ int exclude_bars)
+{
+ struct resource *res = pci_resource_n(pdev, resno);
+ struct pci_dev_resource *dev_res;
+ struct pci_bus *bus = pdev->bus;
+ struct resource *b_win, *r;
+ LIST_HEAD(saved);
+ unsigned int i;
+ int ret = 0;
+
+ b_win = pbus_select_window(bus, res);
+ if (!b_win)
+ return -EINVAL;
+
+ pci_dev_for_each_resource(pdev, r, i) {
+ if (i >= PCI_BRIDGE_RESOURCES)
+ break;
+
+ if (exclude_bars & BIT(i))
+ continue;
+
+ if (b_win != pbus_select_window(bus, r))
+ continue;
+
+ ret = add_to_list(&saved, pdev, r, 0, 0);
+ if (ret)
+ goto restore;
+ pci_release_resource(pdev, i);
+ }
+
+ pci_resize_resource_set_size(pdev, resno, size);
+
+ if (!bus->self)
+ goto out;
+
+ down_read(&pci_bus_sem);
+ ret = pbus_reassign_bridge_resources(bus, res, &saved);
+ if (ret)
+ goto restore;
+
+out:
+ up_read(&pci_bus_sem);
+ free_list(&saved);
+ return ret;
+
+restore:
/* Revert to the old configuration */
list_for_each_entry(dev_res, &saved, list) {
struct resource *res = dev_res->res;
+ struct pci_dev *dev = dev_res->dev;
- bridge = dev_res->dev;
- i = pci_resource_num(bridge, res);
+ i = pci_resource_num(dev, res);
+
+ if (res->parent) {
+ release_child_resources(res);
+ pci_release_resource(dev, i);
+ }
restore_dev_resource(dev_res);
- pci_claim_resource(bridge, i);
- pci_setup_bridge(bridge->subordinate);
- }
- free_list(&saved);
- up_read(&pci_bus_sem);
+ ret = pci_claim_resource(dev, i);
+ if (ret)
+ continue;
- return ret;
+ if (i < PCI_BRIDGE_RESOURCES) {
+ const char *res_name = pci_resource_name(dev, i);
+
+ pci_update_resource(dev, i);
+ pci_info(dev, "%s %pR: old value restored\n",
+ res_name, res);
+ }
+ if (dev->subordinate)
+ pci_setup_bridge(dev->subordinate);
+ }
+ goto out;
}
void pci_assign_unassigned_bus_resources(struct pci_bus *bus)
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index c3ba4ccecd43..e5fcadfc58b0 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -431,84 +431,6 @@ int pci_release_resource(struct pci_dev *dev, int resno)
}
EXPORT_SYMBOL(pci_release_resource);
-static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev,
- int resno)
-{
- u16 cmd;
-
- if (pci_resource_is_iov(resno))
- return pci_iov_is_memory_decoding_enabled(dev);
-
- pci_read_config_word(dev, PCI_COMMAND, &cmd);
-
- return cmd & PCI_COMMAND_MEMORY;
-}
-
-static void pci_resize_resource_set_size(struct pci_dev *dev, int resno,
- int size)
-{
- resource_size_t res_size = pci_rebar_size_to_bytes(size);
- struct resource *res = pci_resource_n(dev, resno);
-
- if (!pci_resource_is_iov(resno)) {
- resource_set_size(res, res_size);
- } else {
- resource_set_size(res, res_size * pci_sriov_get_totalvfs(dev));
- pci_iov_resource_set_size(dev, resno, res_size);
- }
-}
-
-int pci_resize_resource(struct pci_dev *dev, int resno, int size)
-{
- struct resource *res = pci_resource_n(dev, resno);
- struct pci_host_bridge *host;
- int old, ret;
- u32 sizes;
-
- /* Check if we must preserve the firmware's resource assignment */
- host = pci_find_host_bridge(dev->bus);
- if (host->preserve_config)
- return -ENOTSUPP;
-
- /* Make sure the resource isn't assigned before resizing it. */
- if (!(res->flags & IORESOURCE_UNSET))
- return -EBUSY;
-
- if (pci_resize_is_memory_decoding_enabled(dev, resno))
- return -EBUSY;
-
- sizes = pci_rebar_get_possible_sizes(dev, resno);
- if (!sizes)
- return -ENOTSUPP;
-
- if (!(sizes & BIT(size)))
- return -EINVAL;
-
- old = pci_rebar_get_current_size(dev, resno);
- if (old < 0)
- return old;
-
- ret = pci_rebar_set_size(dev, resno, size);
- if (ret)
- return ret;
-
- pci_resize_resource_set_size(dev, resno, size);
-
- /* Check if the new config works by trying to assign everything. */
- if (dev->bus->self) {
- ret = pbus_reassign_bridge_resources(dev->bus, res);
- if (ret)
- goto error_resize;
- }
- return 0;
-
-error_resize:
- pci_rebar_set_size(dev, resno, old);
- pci_resize_resource_set_size(dev, resno, old);
- return ret;
-}
-EXPORT_SYMBOL(pci_resize_resource);
-
int pci_enable_resources(struct pci_dev *dev, int mask)
{
u16 cmd, old_cmd;
diff --git a/drivers/power/reset/Kconfig b/drivers/power/reset/Kconfig
index 8248895ca903..f6c1bcbb57de 100644
--- a/drivers/power/reset/Kconfig
+++ b/drivers/power/reset/Kconfig
@@ -283,6 +283,15 @@ config POWER_RESET_KEYSTONE
help
Reboot support for the KEYSTONE SoCs.
+config POWER_RESET_SPACEMIT_P1
+ tristate "SpacemiT P1 poweroff and reset driver"
+ depends on ARCH_SPACEMIT || COMPILE_TEST
+ depends on MFD_SPACEMIT_P1
+ default MFD_SPACEMIT_P1
+ help
+ This driver supports power-off and reset operations for the SpacemiT
+ P1 PMIC.
+
config POWER_RESET_SYSCON
bool "Generic SYSCON regmap reset driver"
depends on OF
diff --git a/drivers/power/reset/Makefile b/drivers/power/reset/Makefile
index 51da87e05ce7..0e4ae6f6b5c5 100644
--- a/drivers/power/reset/Makefile
+++ b/drivers/power/reset/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_POWER_RESET_LTC2952) += ltc2952-poweroff.o
obj-$(CONFIG_POWER_RESET_QNAP) += qnap-poweroff.o
obj-$(CONFIG_POWER_RESET_REGULATOR) += regulator-poweroff.o
obj-$(CONFIG_POWER_RESET_RESTART) += restart-poweroff.o
+obj-$(CONFIG_POWER_RESET_SPACEMIT_P1) += spacemit-p1-reboot.o
obj-$(CONFIG_POWER_RESET_ST) += st-poweroff.o
obj-$(CONFIG_POWER_RESET_TH1520_AON) += th1520-aon-reboot.o
obj-$(CONFIG_POWER_RESET_TORADEX_EC) += tdx-ec-poweroff.o
diff --git a/drivers/power/reset/spacemit-p1-reboot.c b/drivers/power/reset/spacemit-p1-reboot.c
new file mode 100644
index 000000000000..9ec3d1fff8f3
--- /dev/null
+++ b/drivers/power/reset/spacemit-p1-reboot.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 by Aurelien Jarno
+ */
+
+#include <linux/bits.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/reboot.h>
+
+/* Power Control Register 2 */
+#define PWR_CTRL2 0x7e
+#define PWR_CTRL2_SHUTDOWN BIT(2) /* Shutdown request */
+#define PWR_CTRL2_RST BIT(1) /* Reset request */
+
+static int spacemit_p1_pwroff_handler(struct sys_off_data *data)
+{
+ struct regmap *regmap = data->cb_data;
+ int ret;
+
+ /* Put the PMIC into shutdown state */
+ ret = regmap_set_bits(regmap, PWR_CTRL2, PWR_CTRL2_SHUTDOWN);
+ if (ret) {
+ dev_err(data->dev, "shutdown failed: %d\n", ret);
+ return notifier_from_errno(ret);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int spacemit_p1_restart_handler(struct sys_off_data *data)
+{
+ struct regmap *regmap = data->cb_data;
+ int ret;
+
+ /* Put the PMIC into reset state */
+ ret = regmap_set_bits(regmap, PWR_CTRL2, PWR_CTRL2_RST);
+ if (ret) {
+ dev_err(data->dev, "restart failed: %d\n", ret);
+ return notifier_from_errno(ret);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int spacemit_p1_reboot_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct regmap *regmap;
+ int ret;
+
+ regmap = dev_get_regmap(dev->parent, NULL);
+ if (!regmap)
+ return -ENODEV;
+
+ ret = devm_register_power_off_handler(dev, &spacemit_p1_pwroff_handler,
+ regmap);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to register power off handler\n");
+
+ ret = devm_register_restart_handler(dev, spacemit_p1_restart_handler,
+ regmap);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to register restart handler\n");
+
+ return 0;
+}
+
+static const struct platform_device_id spacemit_p1_reboot_id_table[] = {
+ { "spacemit-p1-reboot", },
+ { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(platform, spacemit_p1_reboot_id_table);
+
+static struct platform_driver spacemit_p1_reboot_driver = {
+ .driver = {
+ .name = "spacemit-p1-reboot",
+ },
+ .probe = spacemit_p1_reboot_probe,
+ .id_table = spacemit_p1_reboot_id_table,
+};
+module_platform_driver(spacemit_p1_reboot_driver);
+
+MODULE_DESCRIPTION("SpacemiT P1 reboot/poweroff driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index 03c8525b480f..92f9f7aae92f 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -942,6 +942,21 @@ config CHARGER_RT9471
This driver can also be built as a module. If so, the module will be
called rt9471.
+config CHARGER_RT9756
+ tristate "Richtek RT9756 smart cap divider charger driver"
+ depends on I2C
+ select REGMAP_I2C
+ select LINEAR_RANGES
+ help
+ This adds support for the Richtek RT9756 smart cap divider charger,
+ a high-efficiency, high-charge-current charger. The device
+ integrates a smart cap divider topology with 9-channel high speed
+ ADCs that provide input and output voltage, current and
+ temperature monitoring.
+
+ This driver can also be built as a module. If so, the module will be
+ called rt9756.
+
config CHARGER_CROS_USBPD
tristate "ChromeOS EC based USBPD charger"
depends on CROS_USBPD_NOTIFY
@@ -1007,6 +1022,15 @@ config CHARGER_UCS1002
Say Y to enable support for Microchip UCS1002 Programmable
USB Port Power Controller with Charger Emulation.
+config CHARGER_BD71828
+ tristate "Power-supply driver for ROHM BD71828 and BD71815 PMIC"
+ depends on MFD_ROHM_BD71828
+ help
+ Say Y here to enable support for the charger and battery blocks in
+ the ROHM BD71815, BD71817 and BD71828 power management ICs. This
+ driver reports various bits of battery and charger state
+ information.
+
config CHARGER_BD99954
tristate "ROHM bd99954 charger driver"
depends on I2C
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index 6e37a3edf7e3..4b79d5abc49a 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_CHARGER_RT5033) += rt5033_charger.o
obj-$(CONFIG_CHARGER_RT9455) += rt9455_charger.o
obj-$(CONFIG_CHARGER_RT9467) += rt9467-charger.o
obj-$(CONFIG_CHARGER_RT9471) += rt9471.o
+obj-$(CONFIG_CHARGER_RT9756) += rt9756.o
obj-$(CONFIG_BATTERY_TWL4030_MADC) += twl4030_madc_battery.o
obj-$(CONFIG_CHARGER_88PM860X) += 88pm860x_charger.o
obj-$(CONFIG_CHARGER_PF1550) += pf1550-charger.o
@@ -117,6 +118,7 @@ obj-$(CONFIG_CHARGER_SC2731) += sc2731_charger.o
obj-$(CONFIG_FUEL_GAUGE_SC27XX) += sc27xx_fuel_gauge.o
obj-$(CONFIG_FUEL_GAUGE_STC3117) += stc3117_fuel_gauge.o
obj-$(CONFIG_CHARGER_UCS1002) += ucs1002_power.o
+obj-$(CONFIG_CHARGER_BD71828) += bd71828-power.o
obj-$(CONFIG_CHARGER_BD99954) += bd99954-charger.o
obj-$(CONFIG_CHARGER_WILCO) += wilco-charger.o
obj-$(CONFIG_RN5T618_POWER) += rn5t618_power.o
diff --git a/drivers/power/supply/apm_power.c b/drivers/power/supply/apm_power.c
index 9236e0078578..9933cdc5c387 100644
--- a/drivers/power/supply/apm_power.c
+++ b/drivers/power/supply/apm_power.c
@@ -364,7 +364,8 @@ static int __init apm_battery_init(void)
static void __exit apm_battery_exit(void)
{
- apm_get_power_status = NULL;
+ if (apm_get_power_status == apm_battery_apm_get_power_status)
+ apm_get_power_status = NULL;
}
module_init(apm_battery_init);
diff --git a/drivers/power/supply/bd71828-power.c b/drivers/power/supply/bd71828-power.c
new file mode 100644
index 000000000000..f667baedeb77
--- /dev/null
+++ b/drivers/power/supply/bd71828-power.c
@@ -0,0 +1,1049 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* ROHM BD71815, BD71828 and BD71878 Charger driver */
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mfd/rohm-bd71815.h>
+#include <linux/mfd/rohm-bd71828.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+#include <linux/power_supply.h>
+#include <linux/slab.h>
+
+/* common defines */
+#define BD7182x_MASK_VBAT_U 0x1f
+#define BD7182x_MASK_VDCIN_U 0x0f
+#define BD7182x_MASK_IBAT_U 0x3f
+#define BD7182x_MASK_CURDIR_DISCHG 0x80
+#define BD7182x_MASK_CHG_STATE 0x7f
+#define BD7182x_MASK_BAT_TEMP 0x07
+#define BD7182x_MASK_DCIN_DET BIT(0)
+#define BD7182x_MASK_CONF_PON BIT(0)
+#define BD71815_MASK_CONF_XSTB BIT(1)
+#define BD7182x_MASK_BAT_STAT 0x3f
+#define BD7182x_MASK_DCIN_STAT 0x07
+
+#define BD7182x_MASK_WDT_AUTO 0x40
+#define BD7182x_MASK_VBAT_ALM_LIMIT_U 0x01
+#define BD7182x_MASK_CHG_EN 0x01
+
+#define BD7182x_DCIN_COLLAPSE_DEFAULT 0x36
+
+#define MAX_CURRENT_DEFAULT 890000 /* uA */
+#define AC_NAME "bd71828_ac"
+#define BAT_NAME "bd71828_bat"
+
+#define BAT_OPEN 0x7
+
+/*
+ * VBAT low-voltage detection threshold:
+ * 0x00D4 * 16 mV = 212 * 0.016 = 3.392 V
+ */
+#define VBAT_LOW_TH 0x00D4
+
+struct pwr_regs {
+ u8 vbat_avg;
+ u8 ibat;
+ u8 ibat_avg;
+ u8 btemp_vth;
+ u8 chg_state;
+ u8 bat_temp;
+ u8 dcin_stat;
+ u8 dcin_collapse_limit;
+ u8 chg_set1;
+ u8 chg_en;
+ u8 vbat_alm_limit_u;
+ u8 conf;
+ u8 vdcin;
+};
+
+static const struct pwr_regs pwr_regs_bd71828 = {
+ .vbat_avg = BD71828_REG_VBAT_U,
+ .ibat = BD71828_REG_IBAT_U,
+ .ibat_avg = BD71828_REG_IBAT_AVG_U,
+ .btemp_vth = BD71828_REG_VM_BTMP_U,
+ .chg_state = BD71828_REG_CHG_STATE,
+ .bat_temp = BD71828_REG_BAT_TEMP,
+ .dcin_stat = BD71828_REG_DCIN_STAT,
+ .dcin_collapse_limit = BD71828_REG_DCIN_CLPS,
+ .chg_set1 = BD71828_REG_CHG_SET1,
+ .chg_en = BD71828_REG_CHG_EN,
+ .vbat_alm_limit_u = BD71828_REG_ALM_VBAT_LIMIT_U,
+ .conf = BD71828_REG_CONF,
+ .vdcin = BD71828_REG_VDCIN_U,
+};
+
+static const struct pwr_regs pwr_regs_bd71815 = {
+ .vbat_avg = BD71815_REG_VM_SA_VBAT_U,
+ /* BD71815 does not have separate current and current avg */
+ .ibat = BD71815_REG_CC_CURCD_U,
+ .ibat_avg = BD71815_REG_CC_CURCD_U,
+
+ .btemp_vth = BD71815_REG_VM_BTMP,
+ .chg_state = BD71815_REG_CHG_STATE,
+ .bat_temp = BD71815_REG_BAT_TEMP,
+ .dcin_stat = BD71815_REG_DCIN_STAT,
+ .dcin_collapse_limit = BD71815_REG_DCIN_CLPS,
+ .chg_set1 = BD71815_REG_CHG_SET1,
+ .chg_en = BD71815_REG_CHG_SET1,
+ .vbat_alm_limit_u = BD71815_REG_ALM_VBAT_TH_U,
+ .conf = BD71815_REG_CONF,
+
+ .vdcin = BD71815_REG_VM_DCIN_U,
+};
+
+struct bd71828_power {
+ struct regmap *regmap;
+ enum rohm_chip_type chip_type;
+ struct device *dev;
+ struct power_supply *ac;
+ struct power_supply *bat;
+
+ const struct pwr_regs *regs;
+ /* Reg val to uA */
+ int curr_factor;
+ int rsens;
+ int (*get_temp)(struct bd71828_power *pwr, int *temp);
+ int (*bat_inserted)(struct bd71828_power *pwr);
+};
+
+static int bd7182x_write16(struct bd71828_power *pwr, int reg, u16 val)
+{
+ __be16 tmp;
+
+ tmp = cpu_to_be16(val);
+
+ return regmap_bulk_write(pwr->regmap, reg, &tmp, sizeof(tmp));
+}
+
+static int bd7182x_read16_himask(struct bd71828_power *pwr, int reg, int himask,
+ u16 *val)
+{
+ struct regmap *regmap = pwr->regmap;
+ int ret;
+ __be16 rvals;
+ u8 *tmp = (u8 *)&rvals;
+
+ ret = regmap_bulk_read(regmap, reg, &rvals, sizeof(*val));
+ if (!ret) {
+ *tmp &= himask;
+ *val = be16_to_cpu(rvals);
+ }
+
+ return ret;
+}
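
To illustrate the himask handling (the raw bytes are invented): the device
returns the 16-bit value big-endian, and only the low bits of the high
byte are meaningful. With BD7182x_MASK_VBAT_U (0x1f) as the himask, raw
bytes { 0xe3, 0x45 } mask down to { 0x03, 0x45 } and decode to 0x0345.

    u8 raw[2] = { 0xe3, 0x45 };       /* as read, big-endian */
    raw[0] &= BD7182x_MASK_VBAT_U;    /* 0xe3 & 0x1f = 0x03 */
    u16 val = (raw[0] << 8) | raw[1]; /* 0x0345 */
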
+
+static int bd71828_get_vbat(struct bd71828_power *pwr, int *vcell)
+{
+ u16 tmp_vcell;
+ int ret;
+
+ ret = bd7182x_read16_himask(pwr, pwr->regs->vbat_avg,
+ BD7182x_MASK_VBAT_U, &tmp_vcell);
+ if (ret)
+ dev_err(pwr->dev, "Failed to read battery average voltage\n");
+ else
+ *vcell = ((int)tmp_vcell) * 1000;
+
+ return ret;
+}
+
+static int bd71828_get_current_ds_adc(struct bd71828_power *pwr, int *curr, int *curr_avg)
+{
+ __be16 tmp_curr;
+ char *tmp = (char *)&tmp_curr;
+ int dir = 1;
+ int regs[] = { pwr->regs->ibat, pwr->regs->ibat_avg };
+ int *vals[] = { curr, curr_avg };
+ int ret, i;
+
+ for (dir = 1, i = 0; i < ARRAY_SIZE(regs); i++) {
+ ret = regmap_bulk_read(pwr->regmap, regs[i], &tmp_curr,
+ sizeof(tmp_curr));
+ if (ret)
+ break;
+
+ if (*tmp & BD7182x_MASK_CURDIR_DISCHG)
+ dir = -1;
+
+ *tmp &= BD7182x_MASK_IBAT_U;
+
+ *vals[i] = dir * ((int)be16_to_cpu(tmp_curr)) * pwr->curr_factor;
+ }
+
+ return ret;
+}
+
+/* Unit is tenths of degree C */
+static int bd71815_get_temp(struct bd71828_power *pwr, int *temp)
+{
+ struct regmap *regmap = pwr->regmap;
+ int ret;
+ int t;
+
+ ret = regmap_read(regmap, pwr->regs->btemp_vth, &t);
+ if (ret)
+ return ret;
+
+ t = 200 - t;
+
+ if (t > 200) {
+ dev_err(pwr->dev, "Failed to read battery temperature\n");
+ return -ENODATA;
+ }
+
+ /* The register reads whole degrees; report tenths of a degree C */
+ *temp = t * 10;
+
+ return 0;
+}
+
+/* Unit is tenths of degree C */
+static int bd71828_get_temp(struct bd71828_power *pwr, int *temp)
+{
+ u16 t;
+ int ret;
+ int tmp = 200 * 10000;
+
+ ret = bd7182x_read16_himask(pwr, pwr->regs->btemp_vth,
+ BD71828_MASK_VM_BTMP_U, &t);
+ if (ret)
+ return ret;
+
+ if (t > 3200) {
+ dev_err(pwr->dev,
+ "Failed to read battery temperature\n");
+ return -ENODATA;
+ }
+
+ tmp -= 625ULL * (unsigned int)t;
+ *temp = tmp / 1000;
+
+ return ret;
+}
+
+static int bd71828_charge_status(struct bd71828_power *pwr,
+ int *s, int *h)
+{
+ unsigned int state;
+ int status, health;
+ int ret;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->chg_state, &state);
+ if (ret) {
+ dev_err(pwr->dev, "charger status reading failed (%d)\n", ret);
+ return ret;
+ }
+
+ state &= BD7182x_MASK_CHG_STATE;
+
+ dev_dbg(pwr->dev, "CHG_STATE %d\n", state);
+
+ switch (state) {
+ case 0x00:
+ status = POWER_SUPPLY_STATUS_DISCHARGING;
+ health = POWER_SUPPLY_HEALTH_GOOD;
+ break;
+ case 0x01:
+ case 0x02:
+ case 0x03:
+ case 0x0E:
+ status = POWER_SUPPLY_STATUS_CHARGING;
+ health = POWER_SUPPLY_HEALTH_GOOD;
+ break;
+ case 0x0F:
+ status = POWER_SUPPLY_STATUS_FULL;
+ health = POWER_SUPPLY_HEALTH_GOOD;
+ break;
+ case 0x10:
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ case 0x14:
+ case 0x20:
+ case 0x21:
+ case 0x22:
+ case 0x23:
+ case 0x24:
+ status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+ health = POWER_SUPPLY_HEALTH_OVERHEAT;
+ break;
+ case 0x30:
+ case 0x31:
+ case 0x32:
+ case 0x40:
+ status = POWER_SUPPLY_STATUS_DISCHARGING;
+ health = POWER_SUPPLY_HEALTH_GOOD;
+ break;
+ case 0x7f:
+ default:
+ status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+ health = POWER_SUPPLY_HEALTH_DEAD;
+ break;
+ }
+
+ if (s)
+ *s = status;
+ if (h)
+ *h = health;
+
+ return ret;
+}
+
+static int get_chg_online(struct bd71828_power *pwr, int *chg_online)
+{
+ int r, ret;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->dcin_stat, &r);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to read DCIN status\n");
+ return ret;
+ }
+ *chg_online = ((r & BD7182x_MASK_DCIN_DET) != 0);
+
+ return 0;
+}
+
+static int get_bat_online(struct bd71828_power *pwr, int *bat_online)
+{
+ int r, ret;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->bat_temp, &r);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to read battery temperature\n");
+ return ret;
+ }
+ *bat_online = ((r & BD7182x_MASK_BAT_TEMP) != BAT_OPEN);
+
+ return 0;
+}
+
+static int bd71828_bat_inserted(struct bd71828_power *pwr)
+{
+ int ret, val;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->conf, &val);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to read CONF register\n");
+ return 0;
+ }
+ ret = val & BD7182x_MASK_CONF_PON;
+
+ if (ret)
+ regmap_update_bits(pwr->regmap, pwr->regs->conf,
+ BD7182x_MASK_CONF_PON, 0);
+
+ return ret;
+}
+
+static int bd71815_bat_inserted(struct bd71828_power *pwr)
+{
+ int ret, val;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->conf, &val);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to read CONF register\n");
+ return ret;
+ }
+
+ ret = !(val & BD71815_MASK_CONF_XSTB);
+ if (ret)
+ regmap_write(pwr->regmap, pwr->regs->conf, val |
+ BD71815_MASK_CONF_XSTB);
+
+ return ret;
+}
+
+static int bd71828_init_hardware(struct bd71828_power *pwr)
+{
+ int ret;
+
+ /* TODO: Collapse limit should come from device-tree ? */
+ ret = regmap_write(pwr->regmap, pwr->regs->dcin_collapse_limit,
+ BD7182x_DCIN_COLLAPSE_DEFAULT);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to write DCIN collapse limit\n");
+ return ret;
+ }
+
+ ret = pwr->bat_inserted(pwr);
+ if (ret < 0)
+ return ret;
+
+ if (ret) {
+ /* WDT_FST auto set */
+ ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_set1,
+ BD7182x_MASK_WDT_AUTO,
+ BD7182x_MASK_WDT_AUTO);
+ if (ret)
+ return ret;
+
+ ret = bd7182x_write16(pwr, pwr->regs->vbat_alm_limit_u,
+ VBAT_LOW_TH);
+ if (ret)
+ return ret;
+
+ /*
+ * On BD71815 we mask the power-state from relax detection.
+ * I am unsure what the impact of the power-state would be if
+ * we didn't - but this is what the vendor driver did, and that
+ * driver has been used in a few projects, so I assume this is
+ * needed.
+ */
+ if (pwr->chip_type == ROHM_CHIP_TYPE_BD71815) {
+ ret = regmap_set_bits(pwr->regmap,
+ BD71815_REG_REX_CTRL_1,
+ REX_PMU_STATE_MASK);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int bd71828_charger_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent);
+ u32 vot;
+ u16 tmp;
+ int online;
+ int ret;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_ONLINE:
+ ret = get_chg_online(pwr, &online);
+ if (!ret)
+ val->intval = online;
+ break;
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ ret = bd7182x_read16_himask(pwr, pwr->regs->vdcin,
+ BD7182x_MASK_VDCIN_U, &tmp);
+ if (ret)
+ return ret;
+
+ vot = tmp;
+ /* 5 millivolt steps */
+ val->intval = 5000 * vot;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
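
The VDCIN conversion above in numbers (the register value is invented):
the register counts in 5 mV steps and the power-supply core expects
microvolts, so a raw reading of 720 reports 720 * 5000 = 3600000 uV,
i.e. 3.6 V.

    val->intval = 5000 * 720; /* 3600000 uV = 3.6 V */
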
+
+static int bd71828_battery_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent);
+ int ret = 0;
+ int status, health, tmp, curr, curr_avg, chg_en;
+
+ if (psp == POWER_SUPPLY_PROP_STATUS ||
+ psp == POWER_SUPPLY_PROP_HEALTH ||
+ psp == POWER_SUPPLY_PROP_CHARGE_TYPE)
+ ret = bd71828_charge_status(pwr, &status, &health);
+ else if (psp == POWER_SUPPLY_PROP_CURRENT_AVG ||
+ psp == POWER_SUPPLY_PROP_CURRENT_NOW)
+ ret = bd71828_get_current_ds_adc(pwr, &curr, &curr_avg);
+ if (ret)
+ return ret;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ val->intval = status;
+ break;
+ case POWER_SUPPLY_PROP_HEALTH:
+ val->intval = health;
+ break;
+ case POWER_SUPPLY_PROP_PRESENT:
+ ret = get_bat_online(pwr, &tmp);
+ if (!ret)
+ val->intval = tmp;
+ break;
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ ret = bd71828_get_vbat(pwr, &tmp);
+ val->intval = tmp;
+ break;
+ case POWER_SUPPLY_PROP_TECHNOLOGY:
+ val->intval = POWER_SUPPLY_TECHNOLOGY_LION;
+ break;
+ case POWER_SUPPLY_PROP_CURRENT_AVG:
+ val->intval = curr_avg;
+ break;
+ case POWER_SUPPLY_PROP_CURRENT_NOW:
+ val->intval = curr;
+ break;
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ val->intval = MAX_CURRENT_DEFAULT;
+ break;
+ case POWER_SUPPLY_PROP_TEMP:
+ ret = pwr->get_temp(pwr, &val->intval);
+ break;
+ case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR:
+ ret = regmap_read(pwr->regmap, pwr->regs->chg_en, &chg_en);
+ if (ret)
+ return ret;
+
+ val->intval = (chg_en & BD7182x_MASK_CHG_EN) ?
+ POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO :
+ POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int bd71828_battery_set_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ const union power_supply_propval *val)
+{
+ struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent);
+ int ret = 0;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR:
+ if (val->intval == POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO)
+ ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_en,
+ BD7182x_MASK_CHG_EN,
+ BD7182x_MASK_CHG_EN);
+ else
+ ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_en,
+ BD7182x_MASK_CHG_EN,
+ 0);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+static int bd71828_battery_property_is_writeable(struct power_supply *psy,
+ enum power_supply_property psp)
+{
+ switch (psp) {
+ case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* AC charger properties */
+static const enum power_supply_property bd71828_charger_props[] = {
+ POWER_SUPPLY_PROP_ONLINE,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+};
+
+static const enum power_supply_property bd71828_battery_props[] = {
+ POWER_SUPPLY_PROP_STATUS,
+ POWER_SUPPLY_PROP_HEALTH,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+ POWER_SUPPLY_PROP_PRESENT,
+ POWER_SUPPLY_PROP_TECHNOLOGY,
+ POWER_SUPPLY_PROP_TEMP,
+ POWER_SUPPLY_PROP_CURRENT_AVG,
+ POWER_SUPPLY_PROP_CURRENT_NOW,
+ POWER_SUPPLY_PROP_CURRENT_MAX,
+ POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR,
+};
+
+/* Supplies that bd71828_ac powers */
+static char *bd71828_ac_supplied_to[] = {
+ BAT_NAME,
+};
+
+static const struct power_supply_desc bd71828_ac_desc = {
+ .name = AC_NAME,
+ .type = POWER_SUPPLY_TYPE_MAINS,
+ .properties = bd71828_charger_props,
+ .num_properties = ARRAY_SIZE(bd71828_charger_props),
+ .get_property = bd71828_charger_get_property,
+};
+
+static const struct power_supply_desc bd71828_bat_desc = {
+ .name = BAT_NAME,
+ .type = POWER_SUPPLY_TYPE_BATTERY,
+ .charge_behaviours = BIT(POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO) |
+ BIT(POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE),
+ .properties = bd71828_battery_props,
+ .num_properties = ARRAY_SIZE(bd71828_battery_props),
+ .get_property = bd71828_battery_get_property,
+ .set_property = bd71828_battery_set_property,
+ .property_is_writeable = bd71828_battery_property_is_writeable,
+};
+
+#define RSENS_CURR 10000000LLU
+
+#define BD_ISR_NAME(name) \
+bd7181x_##name##_isr
+
+#define BD_ISR_BAT(name, print, run_gauge) \
+static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \
+{ \
+ struct bd71828_power *pwr = (struct bd71828_power *)data; \
+ \
+ dev_dbg(pwr->dev, "%s\n", print); \
+ power_supply_changed(pwr->bat); \
+ \
+ return IRQ_HANDLED; \
+}
+
+#define BD_ISR_AC(name, print, run_gauge) \
+static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \
+{ \
+ struct bd71828_power *pwr = (struct bd71828_power *)data; \
+ \
+ power_supply_changed(pwr->ac); \
+ dev_dbg(pwr->dev, "%s\n", print); \
+ power_supply_changed(pwr->bat); \
+ \
+ return IRQ_HANDLED; \
+}
+
+#define BD_ISR_DUMMY(name, print) \
+static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \
+{ \
+ struct bd71828_power *pwr = (struct bd71828_power *)data; \
+ \
+ dev_dbg(pwr->dev, "%s\n", print); \
+ \
+ return IRQ_HANDLED; \
+}
+
+BD_ISR_BAT(chg_state_changed, "CHG state changed", true)
+/* DCIN voltage changes */
+BD_ISR_AC(dcin_removed, "DCIN removed", true)
+BD_ISR_AC(clps_out, "DCIN voltage back to normal", true)
+BD_ISR_AC(clps_in, "DCIN voltage collapsed", false)
+BD_ISR_AC(dcin_ovp_res, "DCIN voltage normal", true)
+BD_ISR_AC(dcin_ovp_det, "DCIN OVER VOLTAGE", true)
+
+BD_ISR_DUMMY(dcin_mon_det, "DCIN voltage below threshold")
+BD_ISR_DUMMY(dcin_mon_res, "DCIN voltage above threshold")
+
+BD_ISR_DUMMY(vsys_uv_res, "VSYS under-voltage cleared")
+BD_ISR_DUMMY(vsys_uv_det, "VSYS under-voltage")
+BD_ISR_DUMMY(vsys_low_res, "'VSYS low' cleared")
+BD_ISR_DUMMY(vsys_low_det, "VSYS low")
+BD_ISR_DUMMY(vsys_mon_res, "VSYS mon - resumed")
+BD_ISR_DUMMY(vsys_mon_det, "VSYS mon - detected")
+BD_ISR_BAT(chg_wdg_temp, "charger temperature watchdog triggered", true)
+BD_ISR_BAT(chg_wdg, "charging watchdog triggered", true)
+BD_ISR_BAT(bat_removed, "Battery removed", true)
+BD_ISR_BAT(bat_det, "Battery detected", true)
+/* TODO: Verify the meaning of these interrupts */
+BD_ISR_BAT(rechg_det, "Recharging", true)
+BD_ISR_BAT(rechg_res, "Recharge ending", true)
+BD_ISR_DUMMY(temp_transit, "Temperature transition")
+BD_ISR_BAT(therm_rmv, "bd71815-therm-rmv", false)
+BD_ISR_BAT(therm_det, "bd71815-therm-det", true)
+BD_ISR_BAT(bat_dead, "bd71815-bat-dead", false)
+BD_ISR_BAT(bat_short_res, "bd71815-bat-short-res", true)
+BD_ISR_BAT(bat_short, "bd71815-bat-short-det", false)
+BD_ISR_BAT(bat_low_res, "bd71815-bat-low-res", true)
+BD_ISR_BAT(bat_low, "bd71815-bat-low-det", true)
+BD_ISR_BAT(bat_ov_res, "bd71815-bat-over-res", true)
+/* What should we do here? */
+BD_ISR_BAT(bat_ov, "bd71815-bat-over-det", false)
+BD_ISR_BAT(bat_mon_res, "bd71815-bat-mon-res", true)
+BD_ISR_BAT(bat_mon, "bd71815-bat-mon-det", true)
+BD_ISR_BAT(bat_cc_mon, "bd71815-bat-cc-mon2", false)
+BD_ISR_BAT(bat_oc1_res, "bd71815-bat-oc1-res", true)
+BD_ISR_BAT(bat_oc1, "bd71815-bat-oc1-det", false)
+BD_ISR_BAT(bat_oc2_res, "bd71815-bat-oc2-res", true)
+BD_ISR_BAT(bat_oc2, "bd71815-bat-oc2-det", false)
+BD_ISR_BAT(bat_oc3_res, "bd71815-bat-oc3-res", true)
+BD_ISR_BAT(bat_oc3, "bd71815-bat-oc3-det", false)
+BD_ISR_BAT(temp_bat_low_res, "bd71815-temp-bat-low-res", true)
+BD_ISR_BAT(temp_bat_low, "bd71815-temp-bat-low-det", true)
+BD_ISR_BAT(temp_bat_hi_res, "bd71815-temp-bat-hi-res", true)
+BD_ISR_BAT(temp_bat_hi, "bd71815-temp-bat-hi-det", true)
+
+static irqreturn_t bd7182x_dcin_removed(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ power_supply_changed(pwr->ac);
+ dev_dbg(pwr->dev, "DCIN removed\n");
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd718x7_chg_done(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd7182x_dcin_detected(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "DCIN inserted\n");
+ power_supply_changed(pwr->ac);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_vbat_low_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VBAT LOW Resumed\n");
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_vbat_low_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VBAT LOW Detected\n");
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_bat_hi_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_warn(pwr->dev, "Overtemp Detected\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_bat_hi_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "Overtemp Resumed\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_bat_low_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "Lowtemp Detected\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_bat_low_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "Lowtemp Resumed\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_vf_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VF Detected\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_vf_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VF Resumed\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_vf125_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VF125 Detected\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_vf125_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VF125 Resumed\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+struct bd7182x_irq_res {
+ const char *name;
+ irq_handler_t handler;
+};
+
+#define BDIRQ(na, hn) { .name = (na), .handler = (hn) }
+
+static int bd7182x_get_irqs(struct platform_device *pdev,
+ struct bd71828_power *pwr)
+{
+ int i, irq, ret;
+ static const struct bd7182x_irq_res bd71815_irqs[] = {
+ BDIRQ("bd71815-dcin-rmv", BD_ISR_NAME(dcin_removed)),
+ BDIRQ("bd71815-dcin-clps-out", BD_ISR_NAME(clps_out)),
+ BDIRQ("bd71815-dcin-clps-in", BD_ISR_NAME(clps_in)),
+ BDIRQ("bd71815-dcin-ovp-res", BD_ISR_NAME(dcin_ovp_res)),
+ BDIRQ("bd71815-dcin-ovp-det", BD_ISR_NAME(dcin_ovp_det)),
+ BDIRQ("bd71815-dcin-mon-res", BD_ISR_NAME(dcin_mon_res)),
+ BDIRQ("bd71815-dcin-mon-det", BD_ISR_NAME(dcin_mon_det)),
+
+ BDIRQ("bd71815-vsys-uv-res", BD_ISR_NAME(vsys_uv_res)),
+ BDIRQ("bd71815-vsys-uv-det", BD_ISR_NAME(vsys_uv_det)),
+ BDIRQ("bd71815-vsys-low-res", BD_ISR_NAME(vsys_low_res)),
+ BDIRQ("bd71815-vsys-low-det", BD_ISR_NAME(vsys_low_det)),
+ BDIRQ("bd71815-vsys-mon-res", BD_ISR_NAME(vsys_mon_res)),
+ BDIRQ("bd71815-vsys-mon-det", BD_ISR_NAME(vsys_mon_det)),
+ BDIRQ("bd71815-chg-wdg-temp", BD_ISR_NAME(chg_wdg_temp)),
+ BDIRQ("bd71815-chg-wdg", BD_ISR_NAME(chg_wdg)),
+ BDIRQ("bd71815-rechg-det", BD_ISR_NAME(rechg_det)),
+ BDIRQ("bd71815-rechg-res", BD_ISR_NAME(rechg_res)),
+ BDIRQ("bd71815-ranged-temp-transit", BD_ISR_NAME(temp_transit)),
+ BDIRQ("bd71815-chg-state-change", BD_ISR_NAME(chg_state_changed)),
+ BDIRQ("bd71815-bat-temp-normal", bd71828_temp_bat_hi_res),
+ BDIRQ("bd71815-bat-temp-erange", bd71828_temp_bat_hi_det),
+ BDIRQ("bd71815-bat-rmv", BD_ISR_NAME(bat_removed)),
+ BDIRQ("bd71815-bat-det", BD_ISR_NAME(bat_det)),
+
+ /* Add ISRs for these */
+ BDIRQ("bd71815-therm-rmv", BD_ISR_NAME(therm_rmv)),
+ BDIRQ("bd71815-therm-det", BD_ISR_NAME(therm_det)),
+ BDIRQ("bd71815-bat-dead", BD_ISR_NAME(bat_dead)),
+ BDIRQ("bd71815-bat-short-res", BD_ISR_NAME(bat_short_res)),
+ BDIRQ("bd71815-bat-short-det", BD_ISR_NAME(bat_short)),
+ BDIRQ("bd71815-bat-low-res", BD_ISR_NAME(bat_low_res)),
+ BDIRQ("bd71815-bat-low-det", BD_ISR_NAME(bat_low)),
+ BDIRQ("bd71815-bat-over-res", BD_ISR_NAME(bat_ov_res)),
+ BDIRQ("bd71815-bat-over-det", BD_ISR_NAME(bat_ov)),
+ BDIRQ("bd71815-bat-mon-res", BD_ISR_NAME(bat_mon_res)),
+ BDIRQ("bd71815-bat-mon-det", BD_ISR_NAME(bat_mon)),
+ /* cc-mon 1 & 3 ? */
+ BDIRQ("bd71815-bat-cc-mon2", BD_ISR_NAME(bat_cc_mon)),
+ BDIRQ("bd71815-bat-oc1-res", BD_ISR_NAME(bat_oc1_res)),
+ BDIRQ("bd71815-bat-oc1-det", BD_ISR_NAME(bat_oc1)),
+ BDIRQ("bd71815-bat-oc2-res", BD_ISR_NAME(bat_oc2_res)),
+ BDIRQ("bd71815-bat-oc2-det", BD_ISR_NAME(bat_oc2)),
+ BDIRQ("bd71815-bat-oc3-res", BD_ISR_NAME(bat_oc3_res)),
+ BDIRQ("bd71815-bat-oc3-det", BD_ISR_NAME(bat_oc3)),
+ BDIRQ("bd71815-temp-bat-low-res", BD_ISR_NAME(temp_bat_low_res)),
+ BDIRQ("bd71815-temp-bat-low-det", BD_ISR_NAME(temp_bat_low)),
+ BDIRQ("bd71815-temp-bat-hi-res", BD_ISR_NAME(temp_bat_hi_res)),
+ BDIRQ("bd71815-temp-bat-hi-det", BD_ISR_NAME(temp_bat_hi)),
+ /*
+ * TODO: add rest of the IRQs and re-check the handling.
+ * Check the bd71815-bat-cc-mon1, bd71815-bat-cc-mon3,
+ * bd71815-bat-low-res, bd71815-bat-low-det,
+ * bd71815-bat-hi-res, bd71815-bat-hi-det.
+ */
+ };
+ static const struct bd7182x_irq_res bd71828_irqs[] = {
+ BDIRQ("bd71828-chg-done", bd718x7_chg_done),
+ BDIRQ("bd71828-pwr-dcin-in", bd7182x_dcin_detected),
+ BDIRQ("bd71828-pwr-dcin-out", bd7182x_dcin_removed),
+ BDIRQ("bd71828-vbat-normal", bd71828_vbat_low_res),
+ BDIRQ("bd71828-vbat-low", bd71828_vbat_low_det),
+ BDIRQ("bd71828-btemp-hi", bd71828_temp_bat_hi_det),
+ BDIRQ("bd71828-btemp-cool", bd71828_temp_bat_hi_res),
+ BDIRQ("bd71828-btemp-lo", bd71828_temp_bat_low_det),
+ BDIRQ("bd71828-btemp-warm", bd71828_temp_bat_low_res),
+ BDIRQ("bd71828-temp-hi", bd71828_temp_vf_det),
+ BDIRQ("bd71828-temp-norm", bd71828_temp_vf_res),
+ BDIRQ("bd71828-temp-125-over", bd71828_temp_vf125_det),
+ BDIRQ("bd71828-temp-125-under", bd71828_temp_vf125_res),
+ };
+ int num_irqs;
+ const struct bd7182x_irq_res *irqs;
+
+ switch (pwr->chip_type) {
+ case ROHM_CHIP_TYPE_BD71828:
+ irqs = &bd71828_irqs[0];
+ num_irqs = ARRAY_SIZE(bd71828_irqs);
+ break;
+ case ROHM_CHIP_TYPE_BD71815:
+ irqs = &bd71815_irqs[0];
+ num_irqs = ARRAY_SIZE(bd71815_irqs);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ for (i = 0; i < num_irqs; i++) {
+ irq = platform_get_irq_byname(pdev, irqs[i].name);
+ if (irq < 0)
+ return irq;
+
+ ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+ irqs[i].handler, 0,
+ irqs[i].name, pwr);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+#define RSENS_DEFAULT_30MOHM 30000 /* 30 mOhm in micro-ohms */
+
+static int bd7182x_get_rsens(struct bd71828_power *pwr)
+{
+ u64 tmp = RSENS_CURR;
+ int rsens_ohm = RSENS_DEFAULT_30MOHM;
+ struct fwnode_handle *node = NULL;
+
+ if (pwr->dev->parent)
+ node = dev_fwnode(pwr->dev->parent);
+
+ if (node) {
+ int ret;
+ u32 rs;
+
+ ret = fwnode_property_read_u32(node,
+ "rohm,charger-sense-resistor-micro-ohms",
+ &rs);
+ if (ret) {
+ if (ret == -EINVAL) {
+ rs = RSENS_DEFAULT_30MOHM;
+ } else {
+ dev_err(pwr->dev, "Bad RSENS dt property\n");
+ return ret;
+ }
+ }
+ if (!rs) {
+ dev_err(pwr->dev, "Bad RSENS value\n");
+ return -EINVAL;
+ }
+
+ rsens_ohm = (int)rs;
+ }
+
+ /* Reg val to uA */
+ do_div(tmp, rsens_ohm);
+
+ pwr->curr_factor = tmp;
+ pwr->rsens = rsens_ohm;
+ dev_dbg(pwr->dev, "Setting rsens to %u micro ohm\n", pwr->rsens);
+ dev_dbg(pwr->dev, "Setting curr-factor to %u\n", pwr->curr_factor);
+
+ return 0;
+}
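
A minimal device-tree sketch of the firmware property this helper parses; the node name, unit address, and compatible below are illustrative, only the rohm,charger-sense-resistor-micro-ohms property name is taken from the code above:

        pmic@4b {
                compatible = "rohm,bd71828";
                /* 10 mOhm shunt; omit the property to get the 30 mOhm default */
                rohm,charger-sense-resistor-micro-ohms = <10000>;
        };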
+
+static int bd71828_power_probe(struct platform_device *pdev)
+{
+ struct bd71828_power *pwr;
+ struct power_supply_config ac_cfg = {};
+ struct power_supply_config bat_cfg = {};
+ int ret;
+ struct regmap *regmap;
+
+ regmap = dev_get_regmap(pdev->dev.parent, NULL);
+ if (!regmap) {
+ dev_err(&pdev->dev, "No parent regmap\n");
+ return -EINVAL;
+ }
+
+ pwr = devm_kzalloc(&pdev->dev, sizeof(*pwr), GFP_KERNEL);
+ if (!pwr)
+ return -ENOMEM;
+
+ pwr->regmap = regmap;
+ pwr->dev = &pdev->dev;
+ pwr->chip_type = platform_get_device_id(pdev)->driver_data;
+
+ switch (pwr->chip_type) {
+ case ROHM_CHIP_TYPE_BD71828:
+ pwr->bat_inserted = bd71828_bat_inserted;
+ pwr->get_temp = bd71828_get_temp;
+ pwr->regs = &pwr_regs_bd71828;
+ break;
+ case ROHM_CHIP_TYPE_BD71815:
+ pwr->bat_inserted = bd71815_bat_inserted;
+ pwr->get_temp = bd71815_get_temp;
+ pwr->regs = &pwr_regs_bd71815;
+ break;
+ default:
+ dev_err(pwr->dev, "Unknown PMIC\n");
+ return -EINVAL;
+ }
+
+ ret = bd7182x_get_rsens(pwr);
+ if (ret)
+ return dev_err_probe(&pdev->dev, ret, "sense resistor missing\n");
+
+ dev_set_drvdata(&pdev->dev, pwr);
+ bd71828_init_hardware(pwr);
+
+ bat_cfg.drv_data = pwr;
+ bat_cfg.fwnode = dev_fwnode(&pdev->dev);
+
+ ac_cfg.supplied_to = bd71828_ac_supplied_to;
+ ac_cfg.num_supplicants = ARRAY_SIZE(bd71828_ac_supplied_to);
+ ac_cfg.drv_data = pwr;
+
+ pwr->ac = devm_power_supply_register(&pdev->dev, &bd71828_ac_desc,
+ &ac_cfg);
+ if (IS_ERR(pwr->ac))
+ return dev_err_probe(&pdev->dev, PTR_ERR(pwr->ac),
+ "failed to register ac\n");
+
+ pwr->bat = devm_power_supply_register(&pdev->dev, &bd71828_bat_desc,
+ &bat_cfg);
+ if (IS_ERR(pwr->bat))
+ return dev_err_probe(&pdev->dev, PTR_ERR(pwr->bat),
+ "failed to register bat\n");
+
+ ret = bd7182x_get_irqs(pdev, pwr);
+ if (ret)
+ return dev_err_probe(&pdev->dev, ret, "failed to request IRQs");
+
+ /* Configure wakeup capable */
+ device_set_wakeup_capable(pwr->dev, 1);
+ device_set_wakeup_enable(pwr->dev, 1);
+
+ return 0;
+}
+
+static const struct platform_device_id bd71828_charger_id[] = {
+ { "bd71815-power", ROHM_CHIP_TYPE_BD71815 },
+ { "bd71828-power", ROHM_CHIP_TYPE_BD71828 },
+ { },
+};
+MODULE_DEVICE_TABLE(platform, bd71828_charger_id);
+
+static struct platform_driver bd71828_power_driver = {
+ .driver = {
+ .name = "bd718xx-power",
+ },
+ .probe = bd71828_power_probe,
+ .id_table = bd71828_charger_id,
+};
+
+module_platform_driver(bd71828_power_driver);
+
+MODULE_AUTHOR("Cong Pham <cpham2403@gmail.com>");
+MODULE_DESCRIPTION("ROHM BD718(15/28/78) PMIC Battery Charger driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/cw2015_battery.c b/drivers/power/supply/cw2015_battery.c
index 2263d5d3448f..0806abea2372 100644
--- a/drivers/power/supply/cw2015_battery.c
+++ b/drivers/power/supply/cw2015_battery.c
@@ -699,7 +699,13 @@ static int cw_bat_probe(struct i2c_client *client)
if (!cw_bat->battery_workqueue)
return -ENOMEM;
- devm_delayed_work_autocancel(&client->dev, &cw_bat->battery_delay_work, cw_bat_work);
+ ret = devm_delayed_work_autocancel(&client->dev, &cw_bat->battery_delay_work, cw_bat_work);
+ if (ret) {
+ dev_err_probe(&client->dev, ret,
+ "Failed to register delayed work\n");
+ return ret;
+ }
+
queue_delayed_work(cw_bat->battery_workqueue,
&cw_bat->battery_delay_work, msecs_to_jiffies(10));
return 0;
diff --git a/drivers/power/supply/max17040_battery.c b/drivers/power/supply/max17040_battery.c
index c1640bc6accd..48453508688a 100644
--- a/drivers/power/supply/max17040_battery.c
+++ b/drivers/power/supply/max17040_battery.c
@@ -388,6 +388,7 @@ static int max17040_get_property(struct power_supply *psy,
union power_supply_propval *val)
{
struct max17040_chip *chip = power_supply_get_drvdata(psy);
+ int ret;
switch (psp) {
case POWER_SUPPLY_PROP_ONLINE:
@@ -410,7 +411,10 @@ static int max17040_get_property(struct power_supply *psy,
if (!chip->channel_temp)
return -ENODATA;
- iio_read_channel_processed(chip->channel_temp, &val->intval);
+ ret = iio_read_channel_processed(chip->channel_temp, &val->intval);
+ if (ret)
+ return ret;
+
val->intval /= 100; /* Convert from milli- to deci-degree */
break;
diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c
index b1a227bf72e2..5dd02f658f5b 100644
--- a/drivers/power/supply/max77705_charger.c
+++ b/drivers/power/supply/max77705_charger.c
@@ -40,6 +40,39 @@ static enum power_supply_property max77705_charger_props[] = {
POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT,
};
+static irqreturn_t max77705_aicl_irq(int irq, void *irq_drv_data)
+{
+ struct max77705_charger_data *chg = irq_drv_data;
+ unsigned int regval, irq_status;
+ int err;
+
+ err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status);
+ if (err < 0)
+ return IRQ_HANDLED;
+
+ // the irq also fires at the end of a current-decrease sequence;
+ // check the AICL_I bit early to guard against that excess irq call
+ while (!(irq_status & BIT(MAX77705_AICL_I))) {
+ err = regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], &regval);
+ if (err < 0)
+ return IRQ_HANDLED;
+
+ regval--;
+
+ err = regmap_field_write(chg->rfield[MAX77705_CHG_CHGIN_LIM], regval);
+ if (err < 0)
+ return IRQ_HANDLED;
+
+ msleep(AICL_WORK_DELAY_MS);
+
+ err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status);
+ if (err < 0)
+ return IRQ_HANDLED;
+ }
+
+ return IRQ_HANDLED;
+}
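
The handler added above is a small software AICL recovery loop: as long as the AICL_I bit in MAX77705_CHG_REG_INT_OK reads back clear (the adapter is still current-limited), it steps the CHGIN input-current-limit field down by one selector, sleeps for AICL_WORK_DELAY_MS, and re-reads the status, so the input current is walked down until the supply stops collapsing.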
+
static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data)
{
struct max77705_charger_data *chg = irq_drv_data;
@@ -60,7 +93,7 @@ static const struct regmap_irq max77705_charger_irqs[] = {
REGMAP_IRQ_REG_LINE(MAX77705_AICL_I, BITS_PER_BYTE),
};
-static struct regmap_irq_chip max77705_charger_irq_chip = {
+static const struct regmap_irq_chip max77705_charger_irq_chip = {
.name = "max77705-charger",
.status_base = MAX77705_CHG_REG_INT,
.mask_base = MAX77705_CHG_REG_INT_MASK,
@@ -567,6 +600,7 @@ static int max77705_charger_probe(struct i2c_client *i2c)
{
struct power_supply_config pscfg = {};
struct max77705_charger_data *chg;
+ struct regmap_irq_chip *chip_desc;
struct device *dev;
struct regmap_irq_chip_data *irq_data;
int ret;
@@ -580,6 +614,13 @@ static int max77705_charger_probe(struct i2c_client *i2c)
chg->dev = dev;
i2c_set_clientdata(i2c, chg);
+ chip_desc = devm_kmemdup(dev, &max77705_charger_irq_chip,
+ sizeof(max77705_charger_irq_chip),
+ GFP_KERNEL);
+ if (!chip_desc)
+ return -ENOMEM;
+ chip_desc->irq_drv_data = chg;
+
chg->regmap = devm_regmap_init_i2c(i2c, &max77705_chg_regmap_config);
if (IS_ERR(chg->regmap))
return PTR_ERR(chg->regmap);
@@ -599,11 +640,9 @@ static int max77705_charger_probe(struct i2c_client *i2c)
if (IS_ERR(chg->psy_chg))
return PTR_ERR(chg->psy_chg);
- max77705_charger_irq_chip.irq_drv_data = chg;
ret = devm_regmap_add_irq_chip(chg->dev, chg->regmap, i2c->irq,
IRQF_ONESHOT, 0,
- &max77705_charger_irq_chip,
- &irq_data);
+ chip_desc, &irq_data);
if (ret)
return dev_err_probe(dev, ret, "failed to add irq chip\n");
@@ -632,6 +671,15 @@ static int max77705_charger_probe(struct i2c_client *i2c)
goto destroy_wq;
}
+ ret = devm_request_threaded_irq(dev, regmap_irq_get_virq(irq_data, MAX77705_AICL_I),
+ NULL, max77705_aicl_irq,
+ IRQF_TRIGGER_NONE,
+ "aicl-irq", chg);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to Request aicl IRQ\n");
+ goto destroy_wq;
+ }
+
ret = max77705_charger_enable(chg);
if (ret) {
dev_err_probe(dev, ret, "failed to enable charge\n");
diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c
index 3c2837ef3461..c8028606bba0 100644
--- a/drivers/power/supply/qcom_battmgr.c
+++ b/drivers/power/supply/qcom_battmgr.c
@@ -678,12 +678,7 @@ static int qcom_battmgr_set_charge_start_threshold(struct qcom_battmgr *battmgr,
u32 target_soc, delta_soc;
int ret;
- if (start_soc < CHARGE_CTRL_START_THR_MIN ||
- start_soc > CHARGE_CTRL_START_THR_MAX) {
- dev_err(battmgr->dev, "charge control start threshold exceed range: [%u - %u]\n",
- CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX);
- return -EINVAL;
- }
+ start_soc = clamp(start_soc, CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX);
/*
* If the new start threshold is larger than the old end threshold,
@@ -716,12 +711,7 @@ static int qcom_battmgr_set_charge_end_threshold(struct qcom_battmgr *battmgr, i
u32 delta_soc = CHARGE_CTRL_DELTA_SOC;
int ret;
- if (end_soc < CHARGE_CTRL_END_THR_MIN ||
- end_soc > CHARGE_CTRL_END_THR_MAX) {
- dev_err(battmgr->dev, "charge control end threshold exceed range: [%u - %u]\n",
- CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX);
- return -EINVAL;
- }
+ end_soc = clamp(end_soc, CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX);
if (battmgr->info.charge_ctrl_start && end_soc > battmgr->info.charge_ctrl_start)
delta_soc = end_soc - battmgr->info.charge_ctrl_start;
diff --git a/drivers/power/supply/rt5033_charger.c b/drivers/power/supply/rt5033_charger.c
index 2fdc58439707..de724f23e453 100644
--- a/drivers/power/supply/rt5033_charger.c
+++ b/drivers/power/supply/rt5033_charger.c
@@ -701,6 +701,8 @@ static int rt5033_charger_probe(struct platform_device *pdev)
np_conn = of_parse_phandle(pdev->dev.of_node, "richtek,usb-connector", 0);
np_edev = of_get_parent(np_conn);
charger->edev = extcon_find_edev_by_node(np_edev);
+ of_node_put(np_edev);
+ of_node_put(np_conn);
if (IS_ERR(charger->edev)) {
dev_warn(charger->dev, "no extcon device found in device-tree\n");
goto out;
diff --git a/drivers/power/supply/rt9467-charger.c b/drivers/power/supply/rt9467-charger.c
index fe773dd8b404..44c26fb37a77 100644
--- a/drivers/power/supply/rt9467-charger.c
+++ b/drivers/power/supply/rt9467-charger.c
@@ -376,7 +376,7 @@ static int rt9467_set_value_from_ranges(struct rt9467_chg_data *data,
if (rsel == RT9467_RANGE_VMIVR) {
ret = linear_range_get_selector_high(range, value, &sel, &found);
if (ret)
- value = range->max_sel;
+ sel = range->max_sel;
} else {
linear_range_get_selector_within(range, value, &sel);
}
@@ -588,6 +588,10 @@ static int rt9467_run_aicl(struct rt9467_chg_data *data)
aicl_vth = mivr_vth + RT9467_AICLVTH_GAP_uV;
ret = rt9467_set_value_from_ranges(data, F_AICL_VTH,
RT9467_RANGE_AICL_VTH, aicl_vth);
+ if (ret) {
+ dev_err(data->dev, "Failed to set AICL VTH\n");
+ return ret;
+ }
/* Trigger AICL function */
ret = regmap_field_write(data->rm_field[F_AICL_MEAS], 1);
diff --git a/drivers/power/supply/rt9756.c b/drivers/power/supply/rt9756.c
new file mode 100644
index 000000000000..f254527be653
--- /dev/null
+++ b/drivers/power/supply/rt9756.c
@@ -0,0 +1,955 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright (C) 2025 Richtek Technology Corp.
+//
+// Authors: ChiYuan Huang <cy_huang@richtek.com>
+
+#include <linux/atomic.h>
+#include <linux/cleanup.h>
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/linear_range.h>
+#include <linux/interrupt.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/power_supply.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <linux/sysfs.h>
+#include <linux/util_macros.h>
+
+#define RT9756_REG_INTFLAG1 0x0B
+#define RT9756_REG_INTFLAG2 0x0D
+#define RT9756_REG_INTFLAG3 0x0F
+#define RT9756_REG_ADCCTL 0x11
+#define RT9756_REG_VBUSADC 0x12
+#define RT9756_REG_BC12FLAG 0x45
+#define RT9756_REG_INTFLAG4 0x49
+
+/* Flag1 */
+#define RT9756_EVT_BUSOVP BIT(3)
+#define RT9756_EVT_BUSOCP BIT(2)
+#define RT9756_EVT_BUSUCP BIT(0)
+/* Flag2 */
+#define RT9756_EVT_BATOVP BIT(7)
+#define RT9756_EVT_BATOCP BIT(6)
+#define RT9756_EVT_TDIEOTP BIT(3)
+#define RT9756_EVT_VBUSLOW_ERR BIT(2)
+#define RT9756_EVT_VAC_INSERT BIT(0)
+/* Flag3 */
+#define RT9756_EVT_WDT BIT(5)
+#define RT9756_EVT_VAC_UVLO BIT(4)
+/* ADCCTL */
+#define RT9756_ADCEN_MASK BIT(7)
+#define RT9756_ADCONCE_MASK BIT(6)
+/* Bc12_flag */
+#define RT9756_EVT_BC12_DONE BIT(3)
+/* Flag4 */
+#define RT9756_EVT_OUTOVP BIT(0)
+
+#define RICHTEK_DEVID 7
+#define RT9756_REVID 0
+#define RT9756A_REVID 1
+#define RT9757_REVID 2
+#define RT9757A_REVID 3
+#define RT9756_ADC_CONVTIME 1200
+#define RT9756_ADC_MAXWAIT 16000
+
+enum rt9756_model {
+ MODEL_RT9756 = 0,
+ MODEL_RT9757,
+ MODEL_RT9770,
+ MODEL_MAX
+};
+
+enum rt9756_adc_chan {
+ ADC_VBUS = 0,
+ ADC_IBUS,
+ ADC_VBAT,
+ ADC_IBAT,
+ ADC_TDIE,
+ ADC_MAX_CHANNEL
+};
+
+enum rt9756_usb_type {
+ USB_NO_VBUS = 0,
+ USB_SDP = 2,
+ USB_NSTD,
+ USB_DCP,
+ USB_CDP,
+ MAX_USB_TYPE
+};
+
+enum rt9756_fields {
+ F_VBATOVP = 0,
+ F_VBATOVP_EN,
+ F_IBATOCP,
+ F_IBATOCP_EN,
+ F_VBUSOVP,
+ F_VBUSOVP_EN,
+ F_IBUSOCP,
+ F_IBUSOCP_EN,
+ F_SWITCHING,
+ F_REG_RST,
+ F_CHG_EN,
+ F_OP_MODE,
+ F_WDT_DIS,
+ F_WDT_TMR,
+ F_DEV_ID,
+ F_BC12_EN,
+ F_USB_STATE,
+ F_VBUS_STATE,
+ F_IBAT_RSEN,
+ F_REVISION,
+ F_MAX_FIELD
+};
+
+enum rt9756_ranges {
+ R_VBATOVP = 0,
+ R_IBATOCP,
+ R_VBUSOVP,
+ R_IBUSOCP,
+ R_MAX_RANGE
+};
+
+static const struct reg_field rt9756_chg_fields[F_MAX_FIELD] = {
+ [F_VBATOVP] = REG_FIELD(0x08, 0, 4),
+ [F_VBATOVP_EN] = REG_FIELD(0x08, 7, 7),
+ [F_IBATOCP] = REG_FIELD(0x09, 0, 5),
+ [F_IBATOCP_EN] = REG_FIELD(0x09, 7, 7),
+ [F_VBUSOVP] = REG_FIELD(0x06, 0, 5),
+ [F_VBUSOVP_EN] = REG_FIELD(0x06, 7, 7),
+ [F_IBUSOCP] = REG_FIELD(0x07, 0, 4),
+ [F_IBUSOCP_EN] = REG_FIELD(0x07, 5, 5),
+ [F_SWITCHING] = REG_FIELD(0x5c, 7, 7),
+ [F_REG_RST] = REG_FIELD(0x00, 7, 7),
+ [F_CHG_EN] = REG_FIELD(0x00, 6, 6),
+ [F_OP_MODE] = REG_FIELD(0x00, 5, 5),
+ [F_WDT_DIS] = REG_FIELD(0x00, 3, 3),
+ [F_WDT_TMR] = REG_FIELD(0x00, 0, 2),
+ [F_DEV_ID] = REG_FIELD(0x03, 0, 3),
+ [F_BC12_EN] = REG_FIELD(0x44, 7, 7),
+ [F_USB_STATE] = REG_FIELD(0x46, 5, 7),
+ [F_VBUS_STATE] = REG_FIELD(0x4c, 0, 0),
+ [F_IBAT_RSEN] = REG_FIELD(0x5e, 0, 1),
+ [F_REVISION] = REG_FIELD(0x62, 0, 1),
+};
+
+static const struct reg_field rt9770_chg_fields[F_MAX_FIELD] = {
+ [F_VBATOVP] = REG_FIELD(0x08, 0, 4),
+ [F_VBATOVP_EN] = REG_FIELD(0x08, 7, 7),
+ [F_IBATOCP] = REG_FIELD(0x09, 0, 5),
+ [F_IBATOCP_EN] = REG_FIELD(0x09, 7, 7),
+ [F_VBUSOVP] = REG_FIELD(0x06, 0, 5),
+ [F_VBUSOVP_EN] = REG_FIELD(0x06, 7, 7),
+ [F_IBUSOCP] = REG_FIELD(0x07, 0, 4),
+ [F_IBUSOCP_EN] = REG_FIELD(0x07, 5, 5),
+ [F_SWITCHING] = REG_FIELD(0x5c, 7, 7),
+ [F_REG_RST] = REG_FIELD(0x00, 7, 7),
+ [F_CHG_EN] = REG_FIELD(0x00, 6, 6),
+ [F_OP_MODE] = REG_FIELD(0x00, 5, 5),
+ [F_WDT_DIS] = REG_FIELD(0x00, 3, 3),
+ [F_WDT_TMR] = REG_FIELD(0x00, 0, 2),
+ [F_DEV_ID] = REG_FIELD(0x60, 0, 3),
+ [F_BC12_EN] = REG_FIELD(0x03, 7, 7),
+ [F_USB_STATE] = REG_FIELD(0x02, 5, 7),
+ [F_VBUS_STATE] = REG_FIELD(0x4c, 0, 0),
+ [F_IBAT_RSEN] = REG_FIELD(0x5e, 0, 1),
+ [F_REVISION] = REG_FIELD(0x62, 3, 7),
+};
+
+/* All converted to microvolt or microamp */
+static const struct linear_range rt9756_chg_ranges[R_MAX_RANGE] = {
+ LINEAR_RANGE_IDX(R_VBATOVP, 4200000, 0, 31, 25000),
+ LINEAR_RANGE_IDX(R_IBATOCP, 2000000, 0, 63, 100000),
+ LINEAR_RANGE_IDX(R_VBUSOVP, 3000000, 0, 63, 50000),
+ LINEAR_RANGE_IDX(R_IBUSOCP, 1000000, 0, 31, 250000),
+};
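
These tables use the standard linear_range mapping value = min + selector * step. A quick worked check against the entries above (a sketch for readers, not driver code):

        /* R_VBATOVP: 4200000 uV min, 25000 uV step, selectors 0..31 */
        /* selector 8  -> 4200000 + 8 * 25000  = 4400000 uV          */
        /* selector 31 -> 4200000 + 31 * 25000 = 4975000 uV (max)    */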
+
+struct charger_event {
+ unsigned int flag1;
+ unsigned int flag2;
+ unsigned int flag3;
+ unsigned int flag4;
+};
+
+struct rt9756_data {
+ struct device *dev;
+ struct regmap *regmap;
+ struct regmap_field *rm_fields[F_MAX_FIELD];
+ struct power_supply *psy;
+ struct power_supply *bat_psy;
+ struct mutex adc_lock;
+ struct power_supply_desc psy_desc;
+ struct power_supply_desc bat_psy_desc;
+ struct charger_event chg_evt;
+ unsigned int rg_resistor;
+ unsigned int real_resistor;
+ enum rt9756_model model;
+ atomic_t usb_type;
+};
+
+struct rt975x_dev_data {
+ const struct regmap_config *regmap_config;
+ const struct reg_field *reg_fields;
+ const struct reg_sequence *init_regs;
+ size_t num_init_regs;
+ int (*check_device_model)(struct rt9756_data *data);
+};
+
+static int rt9756_get_value_field_range(struct rt9756_data *data, enum rt9756_fields en_field,
+ enum rt9756_fields field, enum rt9756_ranges rsel, int *val)
+{
+ const struct linear_range *range = rt9756_chg_ranges + rsel;
+ unsigned int enable, selector, value;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[en_field], &enable);
+ if (ret)
+ return ret;
+
+ if (!enable) {
+ *val = 0;
+ return 0;
+ }
+
+ ret = regmap_field_read(data->rm_fields[field], &selector);
+ if (ret)
+ return ret;
+
+ ret = linear_range_get_value(range, selector, &value);
+ if (ret)
+ return ret;
+
+ *val = (int)value;
+
+ return 0;
+}
+
+static int rt9756_set_value_field_range(struct rt9756_data *data, enum rt9756_fields en_field,
+ enum rt9756_fields field, enum rt9756_ranges rsel, int val)
+{
+ const struct linear_range *range = rt9756_chg_ranges + rsel;
+ unsigned int selector, value;
+ int ret;
+
+ if (!val)
+ return regmap_field_write(data->rm_fields[en_field], 0);
+
+ value = (unsigned int)val;
+ linear_range_get_selector_within(range, value, &selector);
+ ret = regmap_field_write(data->rm_fields[field], selector);
+ if (ret)
+ return ret;
+
+ return regmap_field_write(data->rm_fields[en_field], 1);
+}
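
A small usage sketch of the helper pair above (the current value is illustrative): passing 0 clears the *_EN field and disables the protection entirely, while a non-zero value programs the nearest selector and then enables it.

        /* 3.5 A IBUSOCP: (3500000 - 1000000) / 250000 -> selector 10 */
        ret = rt9756_set_value_field_range(data, F_IBUSOCP_EN, F_IBUSOCP,
                                           R_IBUSOCP, 3500000);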
+
+static int rt9756_get_adc(struct rt9756_data *data, enum rt9756_adc_chan chan,
+ int *val)
+{
+ struct regmap *regmap = data->regmap;
+ unsigned int reg_addr = RT9756_REG_VBUSADC + chan * 2;
+ unsigned int mask = RT9756_ADCEN_MASK | RT9756_ADCONCE_MASK;
+ unsigned int shift = 0, adc_cntl;
+ __be16 raws;
+ int scale, offset = 0, ret;
+
+ guard(mutex)(&data->adc_lock);
+
+ ret = regmap_update_bits(regmap, RT9756_REG_ADCCTL, mask, mask);
+ if (ret)
+ return ret;
+
+ ret = regmap_read_poll_timeout(regmap, RT9756_REG_ADCCTL, adc_cntl,
+ !(adc_cntl & RT9756_ADCEN_MASK),
+ RT9756_ADC_CONVTIME, RT9756_ADC_MAXWAIT);
+ if (ret && ret != -ETIMEDOUT)
+ return ret;
+
+ ret = regmap_raw_read(regmap, reg_addr, &raws, sizeof(raws));
+ if (ret)
+ return ret;
+
+ /*
+ * TDIE LSB is 1 degree C, others LSB is 1000 uV or 1000 uA.
+ * The Rsense ratio is needed for the IBAT channel.
+ */
+ if (chan == ADC_TDIE) {
+ scale = 10;
+ shift = 8;
+ offset = -40;
+ } else if (chan == ADC_IBAT) {
+ scale = 1000 * data->rg_resistor / data->real_resistor;
+ } else {
+ scale = 1000;
+ }
+
+ *val = ((be16_to_cpu(raws) >> shift) + offset) * scale;
+
+ return regmap_update_bits(regmap, RT9756_REG_ADCCTL, mask, 0);
+}
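
A worked example of the TDIE branch above, derived directly from the shift/offset/scale in the code (the raw sample is illustrative): the temperature byte is the high byte of the big-endian word, offset by -40 and scaled by 10, which lands in the tenths-of-a-degree units the power-supply core expects.

        /* (raw >> 8) == 65  ->  (65 - 40) * 10 = 250  ->  25.0 degC */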
+
+static int rt9756_get_switching_state(struct rt9756_data *data, int *status)
+{
+ unsigned int switching_state;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_SWITCHING], &switching_state);
+ if (ret)
+ return ret;
+
+ if (switching_state)
+ *status = POWER_SUPPLY_STATUS_CHARGING;
+ else
+ *status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+
+ return 0;
+}
+
+static int rt9756_get_charger_health(struct rt9756_data *data)
+{
+ struct charger_event *evt = &data->chg_evt;
+
+ if (evt->flag2 & RT9756_EVT_VBUSLOW_ERR)
+ return POWER_SUPPLY_HEALTH_UNDERVOLTAGE;
+
+ if (evt->flag1 & RT9756_EVT_BUSOVP || evt->flag2 & RT9756_EVT_BATOVP ||
+ evt->flag4 & RT9756_EVT_OUTOVP)
+ return POWER_SUPPLY_HEALTH_OVERVOLTAGE;
+
+ if (evt->flag1 & RT9756_EVT_BUSOCP || evt->flag2 & RT9756_EVT_BATOCP)
+ return POWER_SUPPLY_HEALTH_OVERCURRENT;
+
+ if (evt->flag1 & RT9756_EVT_BUSUCP)
+ return POWER_SUPPLY_HEALTH_UNSPEC_FAILURE;
+
+ if (evt->flag2 & RT9756_EVT_TDIEOTP)
+ return POWER_SUPPLY_HEALTH_OVERHEAT;
+
+ if (evt->flag3 & RT9756_EVT_WDT)
+ return POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE;
+
+ return POWER_SUPPLY_HEALTH_GOOD;
+}
+
+static int rt9756_get_charger_online(struct rt9756_data *data, int *val)
+{
+ unsigned int online;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_VBUS_STATE], &online);
+ if (ret)
+ return ret;
+
+ *val = !!online;
+ return 0;
+}
+
+static int rt9756_get_vbus_ovp(struct rt9756_data *data, int *val)
+{
+ unsigned int opmode;
+ int ovpval, ret;
+
+ /* operating mode -> 0 bypass, 1 div2 */
+ ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode);
+ if (ret)
+ return ret;
+
+ ret = rt9756_get_value_field_range(data, F_VBUSOVP_EN, F_VBUSOVP, R_VBUSOVP, &ovpval);
+ if (ret)
+ return ret;
+
+ *val = opmode ? ovpval * 2 : ovpval;
+ return 0;
+}
+
+static int rt9756_set_vbus_ovp(struct rt9756_data *data, int val)
+{
+ unsigned int opmode;
+ int ret;
+
+ /* operating mode -> 0 bypass, 1 div2 */
+ ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode);
+ if (ret)
+ return ret;
+
+ return rt9756_set_value_field_range(data, F_VBUSOVP_EN, F_VBUSOVP, R_VBUSOVP,
+ opmode ? val / 2 : val);
+}
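
A worked example of the div2 handling above (the requested value is illustrative):

        /* opmode == 1 (div2), requested VOLTAGE_MAX = 9000000 uV:
         * written threshold = 9000000 / 2 = 4500000 uV
         * selector          = (4500000 - 3000000) / 50000 = 30
         * read back as 3000000 + 30 * 50000 = 4500000 uV, doubled to 9000000
         */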
+
+static const char * const rt9756_manufacturer = "Richtek Technology Corp.";
+static const char * const rt9756_model[MODEL_MAX] = { "RT9756", "RT9757", "RT9770" };
+
+static int rt9756_psy_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ int *pval = &val->intval;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ return rt9756_get_switching_state(data, pval);
+ case POWER_SUPPLY_PROP_HEALTH:
+ *pval = rt9756_get_charger_health(data);
+ return 0;
+ case POWER_SUPPLY_PROP_ONLINE:
+ return rt9756_get_charger_online(data, pval);
+ case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+ return rt9756_get_vbus_ovp(data, pval);
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ return rt9756_get_adc(data, ADC_VBUS, pval);
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ return rt9756_get_value_field_range(data, F_IBUSOCP_EN, F_IBUSOCP, R_IBUSOCP, pval);
+ case POWER_SUPPLY_PROP_CURRENT_NOW:
+ return rt9756_get_adc(data, ADC_IBUS, pval);
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+ return rt9756_get_value_field_range(data, F_VBATOVP_EN, F_VBATOVP, R_VBATOVP, pval);
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+ return rt9756_get_value_field_range(data, F_IBATOCP_EN, F_IBATOCP, R_IBATOCP, pval);
+ case POWER_SUPPLY_PROP_TEMP:
+ return rt9756_get_adc(data, ADC_TDIE, pval);
+ case POWER_SUPPLY_PROP_USB_TYPE:
+ *pval = atomic_read(&data->usb_type);
+ return 0;
+ case POWER_SUPPLY_PROP_MODEL_NAME:
+ val->strval = rt9756_model[data->model];
+ return 0;
+ case POWER_SUPPLY_PROP_MANUFACTURER:
+ val->strval = rt9756_manufacturer;
+ return 0;
+ default:
+ return -ENODATA;
+ }
+}
+
+static int rt9756_psy_set_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ const union power_supply_propval *val)
+{
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ int intval = val->intval;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ memset(&data->chg_evt, 0, sizeof(data->chg_evt));
+ return regmap_field_write(data->rm_fields[F_CHG_EN], !!intval);
+ case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+ return rt9756_set_vbus_ovp(data, intval);
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ return rt9756_set_value_field_range(data, F_IBUSOCP_EN, F_IBUSOCP, R_IBUSOCP,
+ intval);
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+ return rt9756_set_value_field_range(data, F_VBATOVP_EN, F_VBATOVP, R_VBATOVP,
+ intval);
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+ return rt9756_set_value_field_range(data, F_IBATOCP_EN, F_IBATOCP, R_IBATOCP,
+ intval);
+ case POWER_SUPPLY_PROP_USB_TYPE:
+ return regmap_field_write(data->rm_fields[F_BC12_EN], !!intval);
+ default:
+ return -EINVAL;
+ }
+}
+
+static const enum power_supply_property rt9756_psy_properties[] = {
+ POWER_SUPPLY_PROP_STATUS,
+ POWER_SUPPLY_PROP_ONLINE,
+ POWER_SUPPLY_PROP_HEALTH,
+ POWER_SUPPLY_PROP_VOLTAGE_MAX,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+ POWER_SUPPLY_PROP_CURRENT_MAX,
+ POWER_SUPPLY_PROP_CURRENT_NOW,
+ POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX,
+ POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX,
+ POWER_SUPPLY_PROP_TEMP,
+ POWER_SUPPLY_PROP_USB_TYPE,
+ POWER_SUPPLY_PROP_MODEL_NAME,
+ POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
+static int rt9756_bat_psy_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ int *pval = &val->intval;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_TECHNOLOGY:
+ *pval = POWER_SUPPLY_TECHNOLOGY_LION;
+ return 0;
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ return rt9756_get_adc(data, ADC_VBAT, pval);
+ case POWER_SUPPLY_PROP_CURRENT_NOW:
+ return rt9756_get_adc(data, ADC_IBAT, pval);
+ default:
+ return -ENODATA;
+ }
+}
+
+static const enum power_supply_property rt9756_bat_psy_properties[] = {
+ POWER_SUPPLY_PROP_TECHNOLOGY,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+ POWER_SUPPLY_PROP_CURRENT_NOW,
+};
+
+static int rt9756_psy_property_is_writeable(struct power_supply *psy,
+ enum power_supply_property psp)
+{
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+ case POWER_SUPPLY_PROP_USB_TYPE:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static const unsigned int rt9756_wdt_millisecond[] = {
+ 500, 1000, 5000, 30000, 40000, 80000, 128000, 255000
+};
+
+static ssize_t watchdog_timer_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct power_supply *psy = to_power_supply(dev);
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ unsigned int wdt_tmr_now = 0, wdt_sel, wdt_dis;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_WDT_DIS], &wdt_dis);
+ if (ret)
+ return ret;
+
+ if (!wdt_dis) {
+ ret = regmap_field_read(data->rm_fields[F_WDT_TMR], &wdt_sel);
+ if (ret)
+ return ret;
+
+ wdt_tmr_now = rt9756_wdt_millisecond[wdt_sel];
+ }
+
+ return sysfs_emit(buf, "%d\n", wdt_tmr_now);
+}
+
+static ssize_t watchdog_timer_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct power_supply *psy = to_power_supply(dev);
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ unsigned int wdt_set, wdt_sel;
+ int ret;
+
+ ret = kstrtouint(buf, 10, &wdt_set);
+ if (ret)
+ return ret;
+
+ ret = regmap_field_write(data->rm_fields[F_WDT_DIS], 1);
+ if (ret)
+ return ret;
+
+ wdt_sel = find_closest(wdt_set, rt9756_wdt_millisecond,
+ ARRAY_SIZE(rt9756_wdt_millisecond));
+
+ ret = regmap_field_write(data->rm_fields[F_WDT_TMR], wdt_sel);
+ if (ret)
+ return ret;
+
+ if (wdt_set) {
+ ret = regmap_field_write(data->rm_fields[F_WDT_DIS], 0);
+ if (ret)
+ return ret;
+ }
+
+ return count;
+}
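
Note the store path above first disables the watchdog, then programs the closest supported period (find_closest() against the table, so writing e.g. 25000 selects 30000 ms), and re-enables the timer only for a non-zero input; writing 0 therefore leaves the watchdog disabled, matching the 0 reported by the show path.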
+
+static const char * const rt9756_opmode_str[] = { "bypass", "div2" };
+
+static ssize_t operation_mode_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct power_supply *psy = to_power_supply(dev);
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ unsigned int opmode;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%s\n", rt9756_opmode_str[opmode]);
+}
+
+static ssize_t operation_mode_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct power_supply *psy = to_power_supply(dev);
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ int index, ret;
+
+ index = sysfs_match_string(rt9756_opmode_str, buf);
+ if (index < 0)
+ return index;
+
+ ret = regmap_field_write(data->rm_fields[F_OP_MODE], index);
+
+ return ret ?: count;
+}
+
+static DEVICE_ATTR_RW(watchdog_timer);
+static DEVICE_ATTR_RW(operation_mode);
+
+static struct attribute *rt9756_sysfs_attrs[] = {
+ &dev_attr_watchdog_timer.attr,
+ &dev_attr_operation_mode.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(rt9756_sysfs);
+
+static int rt9756_register_psy(struct rt9756_data *data)
+{
+ struct power_supply_desc *desc = &data->psy_desc;
+ struct power_supply_desc *bat_desc = &data->bat_psy_desc;
+ struct power_supply_config cfg = {}, bat_cfg = {};
+ struct device *dev = data->dev;
+ char *psy_name, *bat_psy_name, **supplied_to;
+
+ bat_cfg.drv_data = data;
+ bat_cfg.fwnode = dev_fwnode(dev);
+
+ bat_psy_name = devm_kasprintf(dev, GFP_KERNEL, "rt9756-%s-battery", dev_name(dev));
+ if (!bat_psy_name)
+ return -ENOMEM;
+
+ bat_desc->name = bat_psy_name;
+ bat_desc->type = POWER_SUPPLY_TYPE_BATTERY;
+ bat_desc->properties = rt9756_bat_psy_properties;
+ bat_desc->num_properties = ARRAY_SIZE(rt9756_bat_psy_properties);
+ bat_desc->get_property = rt9756_bat_psy_get_property;
+
+ data->bat_psy = devm_power_supply_register(dev, bat_desc, &bat_cfg);
+ if (IS_ERR(data->bat_psy))
+ return dev_err_probe(dev, PTR_ERR(data->bat_psy), "Failed to register battery\n");
+
+ supplied_to = devm_kzalloc(dev, sizeof(*supplied_to), GFP_KERNEL);
+ if (!supplied_to)
+ return -ENOMEM;
+
+ /* Link charger psy to battery psy */
+ supplied_to[0] = bat_psy_name;
+
+ cfg.drv_data = data;
+ cfg.fwnode = dev_fwnode(dev);
+ cfg.attr_grp = rt9756_sysfs_groups;
+ cfg.supplied_to = supplied_to;
+ cfg.num_supplicants = 1;
+
+ psy_name = devm_kasprintf(dev, GFP_KERNEL, "rt9756-%s", dev_name(dev));
+ if (!psy_name)
+ return -ENOMEM;
+
+ desc->name = psy_name;
+ desc->type = POWER_SUPPLY_TYPE_USB;
+ desc->usb_types = BIT(POWER_SUPPLY_USB_TYPE_UNKNOWN) | BIT(POWER_SUPPLY_USB_TYPE_SDP) |
+ BIT(POWER_SUPPLY_USB_TYPE_DCP) | BIT(POWER_SUPPLY_USB_TYPE_CDP);
+ desc->properties = rt9756_psy_properties;
+ desc->num_properties = ARRAY_SIZE(rt9756_psy_properties);
+ desc->property_is_writeable = rt9756_psy_property_is_writeable;
+ desc->get_property = rt9756_psy_get_property;
+ desc->set_property = rt9756_psy_set_property;
+
+ data->psy = devm_power_supply_register(dev, desc, &cfg);
+
+ return PTR_ERR_OR_ZERO(data->psy);
+}
+
+static int rt9756_get_usb_type(struct rt9756_data *data)
+{
+ unsigned int type;
+ int report_type, ret;
+
+ ret = regmap_field_read(data->rm_fields[F_USB_STATE], &type);
+ if (ret)
+ return ret;
+
+ switch (type) {
+ case USB_SDP:
+ case USB_NSTD:
+ report_type = POWER_SUPPLY_USB_TYPE_SDP;
+ break;
+ case USB_DCP:
+ report_type = POWER_SUPPLY_USB_TYPE_DCP;
+ break;
+ case USB_CDP:
+ report_type = POWER_SUPPLY_USB_TYPE_CDP;
+ break;
+ case USB_NO_VBUS:
+ default:
+ report_type = POWER_SUPPLY_USB_TYPE_UNKNOWN;
+ break;
+ }
+
+ atomic_set(&data->usb_type, report_type);
+ return 0;
+}
+
+static irqreturn_t rt9756_irq_handler(int irq, void *devid)
+{
+ struct rt9756_data *data = devid;
+ struct regmap *regmap = data->regmap;
+ struct charger_event *evt = &data->chg_evt;
+ unsigned int bc12_flag = 0;
+ int ret;
+
+ ret = regmap_read(regmap, RT9756_REG_INTFLAG1, &evt->flag1);
+ if (ret)
+ return IRQ_NONE;
+
+ ret = regmap_read(regmap, RT9756_REG_INTFLAG2, &evt->flag2);
+ if (ret)
+ return IRQ_NONE;
+
+ ret = regmap_read(regmap, RT9756_REG_INTFLAG3, &evt->flag3);
+ if (ret)
+ return IRQ_NONE;
+
+ if (data->model != MODEL_RT9770) {
+ ret = regmap_read(regmap, RT9756_REG_INTFLAG4, &evt->flag4);
+ if (ret)
+ return IRQ_NONE;
+
+ ret = regmap_read(regmap, RT9756_REG_BC12FLAG, &bc12_flag);
+ if (ret)
+ return IRQ_NONE;
+ }
+
+ dev_dbg(data->dev, "events: 0x%02x,%02x,%02x,%02x,%02x\n", evt->flag1, evt->flag2,
+ evt->flag3, evt->flag4, bc12_flag);
+
+ if (evt->flag2 & RT9756_EVT_VAC_INSERT) {
+ ret = regmap_field_write(data->rm_fields[F_BC12_EN], 1);
+ if (ret)
+ return IRQ_NONE;
+ }
+
+ if (evt->flag3 & RT9756_EVT_VAC_UVLO)
+ atomic_set(&data->usb_type, POWER_SUPPLY_USB_TYPE_UNKNOWN);
+
+ if (bc12_flag & RT9756_EVT_BC12_DONE) {
+ ret = rt9756_get_usb_type(data);
+ if (ret)
+ return IRQ_NONE;
+ }
+
+ power_supply_changed(data->psy);
+
+ return IRQ_HANDLED;
+}
+
+static int rt9756_config_batsense_resistor(struct rt9756_data *data)
+{
+ unsigned int shunt_resistor_uohms = 2000, rsense_sel;
+
+ device_property_read_u32(data->dev, "shunt-resistor-micro-ohms", &shunt_resistor_uohms);
+
+ if (!shunt_resistor_uohms || shunt_resistor_uohms > 5000)
+ return -EINVAL;
+
+ data->real_resistor = shunt_resistor_uohms;
+
+ /* Always choose the larger or equal value to prevent false OCP alarms */
+ if (shunt_resistor_uohms <= 1000) {
+ rsense_sel = 0;
+ data->rg_resistor = 1000;
+ } else if (shunt_resistor_uohms <= 2000) {
+ rsense_sel = 1;
+ data->rg_resistor = 2000;
+ } else {
+ rsense_sel = 2;
+ data->rg_resistor = 5000;
+ }
+
+ return regmap_field_write(data->rm_fields[F_IBAT_RSEN], rsense_sel);
+}
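
A worked example of the selection above (the fitted shunt value is illustrative):

        /* shunt-resistor-micro-ohms = <1500>:
         * real_resistor = 1500, rg_resistor rounds up to 2000 (rsense_sel 1)
         * IBAT ADC scale = 1000 * 2000 / 1500 = 1333 (integer) per raw count,
         * compensating for the mismatch between fitted and assumed shunt
         */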
+
+static const struct reg_sequence rt9756_init_regs[] = {
+ REG_SEQ(0x00, 0x80, 1000), /* REG_RESET */
+ REG_SEQ0(0x04, 0x13), /* VACOVP/OVPGATE 12V */
+ REG_SEQ0(0x00, 0x28), /* WDT_DIS = 1 */
+ REG_SEQ0(0x0c, 0x02), /* MASK FLAG1 */
+ REG_SEQ0(0x0e, 0x06), /* MASK FLAG2 */
+ REG_SEQ0(0x10, 0xca), /* MASK FLAG3 */
+ REG_SEQ0(0x44, 0xa0), /* BC12_EN */
+ REG_SEQ0(0x47, 0x07), /* MASK BC12FLAG */
+ REG_SEQ0(0x4a, 0xfe), /* MASK FLAG4 */
+ REG_SEQ0(0x5c, 0x40), /* MASK CON_SWITCHING */
+ REG_SEQ0(0x63, 0x01), /* MASK VDDA_UVLO */
+};
+
+static const struct reg_sequence rt9770_init_regs[] = {
+ REG_SEQ(0x00, 0x80, 1000), /* REG_RESET */
+ REG_SEQ0(0x04, 0x13), /* VACOVP/OVPGATE 12V */
+ REG_SEQ0(0x00, 0x28), /* WDT_DIS = 1 */
+ REG_SEQ0(0x0c, 0x02), /* MASK FLAG1 */
+ REG_SEQ0(0x0e, 0x06), /* MASK FLAG2 */
+ REG_SEQ0(0x10, 0xca), /* MASK FLAG3 */
+ REG_SEQ0(0x5c, 0x40), /* MASK CON_SWITCHING */
+ REG_SEQ0(0x63, 0x01), /* MASK VDDA_UVLO */
+};
+
+static const struct regmap_config rt9756_regmap_config = {
+ .name = "rt9756",
+ .reg_bits = 16,
+ .val_bits = 8,
+ .max_register = 0x1ff,
+};
+
+static const struct regmap_config rt9770_regmap_config = {
+ .name = "rt9770",
+ .reg_bits = 8,
+ .val_bits = 8,
+ .max_register = 0xff,
+};
+
+static int rt9756_check_device_model(struct rt9756_data *data)
+{
+ struct device *dev = data->dev;
+ unsigned int revid;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_REVISION], &revid);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to read revid\n");
+
+ if (revid == RT9757_REVID || revid == RT9757A_REVID)
+ data->model = MODEL_RT9757;
+ else if (revid == RT9756_REVID || revid == RT9756A_REVID)
+ data->model = MODEL_RT9756;
+ else
+ return dev_err_probe(dev, -EINVAL, "Unknown revision %d\n", revid);
+
+ return 0;
+}
+
+static int rt9770_check_device_model(struct rt9756_data *data)
+{
+ data->model = MODEL_RT9770;
+ return 0;
+}
+
+static int rt9756_probe(struct i2c_client *i2c)
+{
+ const struct rt975x_dev_data *dev_data;
+ struct device *dev = &i2c->dev;
+ struct rt9756_data *data;
+ struct regmap *regmap;
+ unsigned int devid;
+ int ret;
+
+ dev_data = device_get_match_data(dev);
+ if (!dev_data)
+ return dev_err_probe(dev, -EINVAL, "No device data found\n");
+
+ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->dev = dev;
+ mutex_init(&data->adc_lock);
+ atomic_set(&data->usb_type, POWER_SUPPLY_USB_TYPE_UNKNOWN);
+ i2c_set_clientdata(i2c, data);
+
+ regmap = devm_regmap_init_i2c(i2c, dev_data->regmap_config);
+ if (IS_ERR(regmap))
+ return dev_err_probe(dev, PTR_ERR(regmap), "Failed to init regmap\n");
+
+ data->regmap = regmap;
+
+ ret = devm_regmap_field_bulk_alloc(dev, regmap, data->rm_fields, dev_data->reg_fields,
+ F_MAX_FIELD);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to alloc regmap fields\n");
+
+ /* Richtek Device ID check */
+ ret = regmap_field_read(data->rm_fields[F_DEV_ID], &devid);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to read devid\n");
+
+ if (devid != RICHTEK_DEVID)
+ return dev_err_probe(dev, -ENODEV, "Incorrect VID 0x%02x\n", devid);
+
+ /* Get specific model */
+ ret = dev_data->check_device_model(data);
+ if (ret)
+ return ret;
+
+ ret = regmap_register_patch(regmap, dev_data->init_regs, dev_data->num_init_regs);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to init registers\n");
+
+ ret = rt9756_config_batsense_resistor(data);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to config batsense resistor\n");
+
+ ret = rt9756_register_psy(data);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to init power supply\n");
+
+ return devm_request_threaded_irq(dev, i2c->irq, NULL, rt9756_irq_handler, IRQF_ONESHOT,
+ dev_name(dev), data);
+}
+
+static void rt9756_shutdown(struct i2c_client *i2c)
+{
+ struct rt9756_data *data = i2c_get_clientdata(i2c);
+
+ regmap_field_write(data->rm_fields[F_REG_RST], 1);
+}
+
+static const struct rt975x_dev_data rt9756_dev_data = {
+ .regmap_config = &rt9756_regmap_config,
+ .reg_fields = rt9756_chg_fields,
+ .init_regs = rt9756_init_regs,
+ .num_init_regs = ARRAY_SIZE(rt9756_init_regs),
+ .check_device_model = rt9756_check_device_model,
+};
+
+static const struct rt975x_dev_data rt9770_dev_data = {
+ .regmap_config = &rt9770_regmap_config,
+ .reg_fields = rt9770_chg_fields,
+ .init_regs = rt9770_init_regs,
+ .num_init_regs = ARRAY_SIZE(rt9770_init_regs),
+ .check_device_model = rt9770_check_device_model,
+};
+
+static const struct of_device_id rt9756_device_match_table[] = {
+ { .compatible = "richtek,rt9756", .data = &rt9756_dev_data },
+ { .compatible = "richtek,rt9770", .data = &rt9770_dev_data },
+ {}
+};
+MODULE_DEVICE_TABLE(of, rt9756_device_match_table);
+
+static struct i2c_driver rt9756_charger_driver = {
+ .driver = {
+ .name = "rt9756",
+ .of_match_table = rt9756_device_match_table,
+ },
+ .probe = rt9756_probe,
+ .shutdown = rt9756_shutdown,
+};
+module_i2c_driver(rt9756_charger_driver);
+
+MODULE_DESCRIPTION("Richtek RT9756 charger driver");
+MODULE_AUTHOR("ChiYuan Huang <cy_huang@richtek.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/wm831x_power.c b/drivers/power/supply/wm831x_power.c
index 6acdba7885ca..78fa0573ef25 100644
--- a/drivers/power/supply/wm831x_power.c
+++ b/drivers/power/supply/wm831x_power.c
@@ -144,6 +144,7 @@ static int wm831x_usb_limit_change(struct notifier_block *nb,
struct wm831x_power,
usb_notify);
unsigned int i, best;
+ int ret;
/* Find the highest supported limit */
best = 0;
@@ -156,8 +157,13 @@ static int wm831x_usb_limit_change(struct notifier_block *nb,
dev_dbg(wm831x_power->wm831x->dev,
"Limiting USB current to %umA", wm831x_usb_limits[best]);
- wm831x_set_bits(wm831x_power->wm831x, WM831X_POWER_STATE,
- WM831X_USB_ILIM_MASK, best);
+ ret = wm831x_set_bits(wm831x_power->wm831x, WM831X_POWER_STATE,
+ WM831X_USB_ILIM_MASK, best);
+ if (ret < 0) {
+ dev_err(wm831x_power->wm831x->dev,
+ "Failed to set USB current limit: %d\n", ret);
+ return ret;
+ }
return 0;
}
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index ac0e132ccc3e..2a5b5a9fdcb3 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -53,9 +53,45 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
}
EXPORT_SYMBOL_GPL(log_non_standard_event);
-void log_arm_hw_error(struct cper_sec_proc_arm *err)
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
{
- trace_arm_event(err);
+ struct cper_arm_err_info *err_info;
+ struct cper_arm_ctx_info *ctx_info;
+ u8 *ven_err_data;
+ u32 ctx_len = 0;
+ int n, sz, cpu;
+ s32 vsei_len;
+ u32 pei_len;
+ u8 *pei_err, *ctx_err;
+
+ pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
+ pei_err = (u8 *)(err + 1);
+
+ err_info = (struct cper_arm_err_info *)(err + 1);
+ ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
+ ctx_err = (u8 *)ctx_info;
+
+ for (n = 0; n < err->context_info_num; n++) {
+ sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
+ ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
+ ctx_len += sz;
+ }
+
+ vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + pei_len + ctx_len);
+ if (vsei_len < 0) {
+ pr_warn(FW_BUG "section length: %d\n", err->section_length);
+ pr_warn(FW_BUG "section length is too small\n");
+ pr_warn(FW_BUG "firmware-generated error record is incorrect\n");
+ vsei_len = 0;
+ }
+ ven_err_data = (u8 *)ctx_info;
+
+ cpu = GET_LOGICAL_INDEX(err->mpidr);
+ if (cpu < 0)
+ cpu = -1;
+
+ trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
+ ven_err_data, (u32)vsei_len, sev, cpu);
}
static int __init ras_init(void)
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index ea532a8a4a0c..a596f6013019 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -313,10 +313,12 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_ccw_private *private,
return 0;
}
-static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
- struct vfio_region_info *info,
- unsigned long arg)
+static int vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
+ struct vfio_ccw_private *private =
+ container_of(vdev, struct vfio_ccw_private, vdev);
int i;
switch (info->index) {
@@ -328,7 +330,6 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
return 0;
default: /* all other regions are handled via capability chain */
{
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
struct vfio_region_info_cap_type cap_type = {
.header.id = VFIO_REGION_INFO_CAP_TYPE,
.header.version = 1 };
@@ -351,27 +352,10 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
cap_type.type = private->region[i].type;
cap_type.subtype = private->region[i].subtype;
- ret = vfio_info_add_capability(&caps, &cap_type.header,
+ ret = vfio_info_add_capability(caps, &cap_type.header,
sizeof(cap_type));
if (ret)
return ret;
-
- info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info->argsz < sizeof(*info) + caps.size) {
- info->argsz = sizeof(*info) + caps.size;
- info->cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(*info));
- if (copy_to_user((void __user *)arg + sizeof(*info),
- caps.buf, caps.size)) {
- kfree(caps.buf);
- return -EFAULT;
- }
- info->cap_offset = sizeof(*info);
- }
-
- kfree(caps.buf);
-
}
}
return 0;
@@ -532,24 +516,6 @@ static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev,
return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
}
- case VFIO_DEVICE_GET_REGION_INFO:
- {
- struct vfio_region_info info;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- ret = vfio_ccw_mdev_get_region_info(private, &info, arg);
- if (ret)
- return ret;
-
- return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
- }
case VFIO_DEVICE_GET_IRQ_INFO:
{
struct vfio_irq_info info;
@@ -627,6 +593,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = {
.read = vfio_ccw_mdev_read,
.write = vfio_ccw_mdev_write,
.ioctl = vfio_ccw_mdev_ioctl,
+ .get_region_info_caps = vfio_ccw_mdev_ioctl_get_region_info,
.request = vfio_ccw_mdev_request,
.dma_unmap = vfio_ccw_dma_unmap,
.bind_iommufd = vfio_iommufd_emulated_bind,
diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c
index ff9adfc0b332..bdfd06516671 100644
--- a/drivers/scsi/bfa/bfad.c
+++ b/drivers/scsi/bfa/bfad.c
@@ -1528,7 +1528,6 @@ bfad_pci_slot_reset(struct pci_dev *pdev)
goto out_disable_device;
}
- pci_save_state(pdev);
pci_set_master(pdev);
rc = dma_set_mask_and_coherent(&bfad->pcidev->dev, DMA_BIT_MASK(64));
diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c
index 79c8dafdd49e..db0c2174430a 100644
--- a/drivers/scsi/csiostor/csio_init.c
+++ b/drivers/scsi/csiostor/csio_init.c
@@ -1093,7 +1093,6 @@ csio_pci_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
/* Bring HW s/m to ready state.
* but don't resume IOs.
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 44214884deaf..95123689e9d1 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -7859,7 +7859,6 @@ static int ipr_reset_restore_cfg_space(struct ipr_cmnd *ipr_cmd)
struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg;
ENTER;
- ioa_cfg->pdev->state_saved = true;
pci_restore_state(ioa_cfg->pdev);
if (ipr_set_pcix_cmd_reg(ioa_cfg)) {
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index f206267d9ecd..065eb91de9c0 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -14434,12 +14434,6 @@ lpfc_io_slot_reset_s3(struct pci_dev *pdev)
pci_restore_state(pdev);
- /*
- * As the new kernel behavior of pci_restore_state() API call clears
- * device saved_state flag, need to save the restored state again.
- */
- pci_save_state(pdev);
-
if (pdev->is_busmaster)
pci_set_master(pdev);
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 5ffd94586652..9007533e36e0 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -7886,11 +7886,6 @@ qla2xxx_pci_slot_reset(struct pci_dev *pdev)
pci_restore_state(pdev);
- /* pci_restore_state() clears the saved_state flag of the device
- * save restored state which resets saved_state flag
- */
- pci_save_state(pdev);
-
if (ha->mem_only)
rc = pci_enable_device_mem(pdev);
else
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 83ff66f954e6..97329c97332f 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -9796,11 +9796,6 @@ qla4xxx_pci_slot_reset(struct pci_dev *pdev)
*/
pci_restore_state(pdev);
- /* pci_restore_state() clears the saved_state flag of the device
- * save restored state which resets saved_state flag
- */
- pci_save_state(pdev);
-
/* Initialize device or resume if in suspended state */
rc = pci_enable_device(pdev);
if (rc) {
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index 152f914c599d..65bd370f282a 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -6178,7 +6178,6 @@ static pci_ers_result_t serial8250_io_slot_reset(struct pci_dev *dev)
return PCI_ERS_RESULT_DISCONNECT;
pci_restore_state(dev);
- pci_save_state(dev);
return PCI_ERS_RESULT_RECOVERED;
}
diff --git a/drivers/tty/serial/jsm/jsm_driver.c b/drivers/tty/serial/jsm/jsm_driver.c
index 417a5b6bffc3..8d21373cae57 100644
--- a/drivers/tty/serial/jsm/jsm_driver.c
+++ b/drivers/tty/serial/jsm/jsm_driver.c
@@ -355,7 +355,6 @@ static void jsm_io_resume(struct pci_dev *pdev)
struct jsm_board *brd = pci_get_drvdata(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
jsm_uart_port_init(brd);
}
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index a7936bd1aabe..ddaa1366704b 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1256,7 +1256,7 @@ static int query_virtqueues(struct mlx5_vdpa_net *ndev,
int vq_idx = start_vq + i;
if (cmd->err) {
- mlx5_vdpa_err(mvdev, "query vq %d failed, err: %d\n", vq_idx, err);
+ mlx5_vdpa_err(mvdev, "query vq %d failed, err: %d\n", vq_idx, cmd->err);
if (!err)
err = cmd->err;
continue;
diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
index 9e8d07078606..31a02e7fd7f2 100644
--- a/drivers/vdpa/octeon_ep/octep_vdpa_main.c
+++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
@@ -736,6 +736,7 @@ static int octep_sriov_enable(struct pci_dev *pdev, int num_vfs)
octep_vdpa_assign_barspace(vf_pdev, pdev, index);
if (++index == num_vfs) {
done = true;
+ pci_dev_put(vf_pdev);
break;
}
}
diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
index 36f61cc96e21..43426bd971ac 100644
--- a/drivers/vdpa/pds/vdpa_dev.c
+++ b/drivers/vdpa/pds/vdpa_dev.c
@@ -51,7 +51,7 @@ static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv)
err = pdsc_register_notify(nb);
if (err) {
nb->notifier_call = NULL;
- dev_err(dev, "failed to register pds event handler: %ps\n",
+ dev_err(dev, "failed to register pds event handler: %pe\n",
ERR_PTR(err));
return -EINVAL;
}
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index e7bced0b5542..ae357d014564 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -2173,7 +2173,8 @@ static int vduse_init(void)
if (!vduse_irq_wq)
goto err_wq;
- vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0);
+ vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound",
+ WQ_HIGHPRI | WQ_PERCPU, 0);
if (!vduse_irq_bound_wq)
goto err_bound_wq;
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
index 5dd5f5ad7686..253031b86b60 100644
--- a/drivers/vfio/cdx/main.c
+++ b/drivers/vfio/cdx/main.c
@@ -129,28 +129,22 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
-static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev,
- struct vfio_region_info __user *arg)
+static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_cdx_device *vdev =
+ container_of(core_vdev, struct vfio_cdx_device, vdev);
struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
- struct vfio_region_info info;
-
- if (copy_from_user(&info, arg, minsz))
- return -EFAULT;
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.index >= cdx_dev->res_count)
+ if (info->index >= cdx_dev->res_count)
return -EINVAL;
/* map offset to the physical address */
- info.offset = vfio_cdx_index_to_offset(info.index);
- info.size = vdev->regions[info.index].size;
- info.flags = vdev->regions[info.index].flags;
-
- return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+ info->offset = vfio_cdx_index_to_offset(info->index);
+ info->size = vdev->regions[info->index].size;
+ info->flags = vdev->regions[info->index].flags;
+ return 0;
}
static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev,
@@ -219,8 +213,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev,
switch (cmd) {
case VFIO_DEVICE_GET_INFO:
return vfio_cdx_ioctl_get_info(vdev, uarg);
- case VFIO_DEVICE_GET_REGION_INFO:
- return vfio_cdx_ioctl_get_region_info(vdev, uarg);
case VFIO_DEVICE_GET_IRQ_INFO:
return vfio_cdx_ioctl_get_irq_info(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
@@ -284,6 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = {
.open_device = vfio_cdx_open_device,
.close_device = vfio_cdx_close_device,
.ioctl = vfio_cdx_ioctl,
+ .get_region_info_caps = vfio_cdx_ioctl_get_region_info,
.device_feature = vfio_cdx_ioctl_feature,
.mmap = vfio_cdx_mmap,
.bind_iommufd = vfio_iommufd_physical_bind,
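
A hedged sketch of the contract the new ->get_region_info_caps() callback implies: the driver fills in an already-copied kernel struct, and the vfio core (not visible in this hunk) is assumed to perform the copy_from_user()/argsz validation and capability-chain copy-out that each driver previously open-coded. The region count and size below are hypothetical.

```c
#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>

#define EXAMPLE_NUM_REGIONS	3	/* hypothetical */

static int example_get_region_info(struct vfio_device *core_vdev,
				   struct vfio_region_info *info,
				   struct vfio_info_cap *caps)
{
	if (info->index >= EXAMPLE_NUM_REGIONS)
		return -EINVAL;

	info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
	info->size = 0x1000;
	info->flags = VFIO_REGION_INFO_FLAG_READ;
	return 0;	/* the core copies info (and caps) back to userspace */
}
```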
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 480cac3a0c27..8ceca24ac136 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -99,7 +99,7 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
return ret;
if (user_size < minsz)
return -EINVAL;
- ret = copy_struct_from_user(&bind, minsz, arg, user_size);
+ ret = copy_struct_from_user(&bind, sizeof(bind), arg, user_size);
if (ret)
return ret;
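
A sketch of why sizeof(bind) is the right ksize in the device_cdev fix: copy_struct_from_user(dst, ksize, src, usize) copies min(ksize, usize) bytes, zero-fills the rest of dst, and rejects non-zero trailing bytes from a newer, larger userspace struct. Passing only minsz as ksize would leave the newer tail of the kernel struct uninitialized instead of zeroed. The struct below is hypothetical.

```c
#include <linux/types.h>
#include <linux/uaccess.h>

struct example_arg {
	__u32 argsz;
	__u32 flags;
	__u32 new_field;	/* added in a later ABI revision */
};

static int fetch_arg(struct example_arg *karg,
		     const void __user *uarg, size_t user_size)
{
	/* zero-fills new_field when an old userspace passes a short struct */
	return copy_struct_from_user(karg, sizeof(*karg), uarg, user_size);
}
```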
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
index 76ccbab0e3d6..ba47100f28c1 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
@@ -117,6 +117,24 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev)
fsl_mc_cleanup_irq_pool(mc_cont);
}
+static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_fsl_mc_device *vdev =
+ container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+
+ if (info->index >= mc_dev->obj_desc.region_count)
+ return -EINVAL;
+
+ /* map offset to the physical address */
+ info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index);
+ info->size = vdev->regions[info->index].size;
+ info->flags = vdev->regions[info->index].flags;
+ return 0;
+}
+
static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
@@ -149,30 +167,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev,
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
}
- case VFIO_DEVICE_GET_REGION_INFO:
- {
- struct vfio_region_info info;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.index >= mc_dev->obj_desc.region_count)
- return -EINVAL;
-
- /* map offset to the physical address */
- info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index);
- info.size = vdev->regions[info.index].size;
- info.flags = vdev->regions[info.index].flags;
-
- if (copy_to_user((void __user *)arg, &info, minsz))
- return -EFAULT;
- return 0;
- }
case VFIO_DEVICE_GET_IRQ_INFO:
{
struct vfio_irq_info info;
@@ -589,6 +583,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = {
.open_device = vfio_fsl_mc_open_device,
.close_device = vfio_fsl_mc_close_device,
.ioctl = vfio_fsl_mc_ioctl,
+ .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info,
.read = vfio_fsl_mc_read,
.write = vfio_fsl_mc_write,
.mmap = vfio_fsl_mc_mmap,
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f54665..2b9fca00e9e8 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM
To enable s390x KVM vfio-pci extensions, say Y.
+config VFIO_PCI_DMABUF
+ def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER
+
source "drivers/vfio/pci/mlx5/Kconfig"
source "drivers/vfio/pci/hisilicon/Kconfig"
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c..53f59226ae01 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,6 +2,7 @@
vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
vfio-pci-y := vfio_pci.o
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index fde33f54e99e..cf45f6370c36 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr)
return 0;
}
+static void qm_xqc_reg_offsets(struct hisi_qm *qm,
+ u32 *eqc_addr, u32 *aeqc_addr)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev =
+ container_of(qm, struct hisi_acc_vf_core_device, vf_qm);
+
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) {
+ *eqc_addr = QM_EQC_VF_DW0;
+ *aeqc_addr = QM_AEQC_VF_DW0;
+ } else {
+ *eqc_addr = QM_EQC_PF_DW0;
+ *aeqc_addr = QM_AEQC_PF_DW0;
+ }
+}
+
static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
{
struct device *dev = &qm->pdev->dev;
+ u32 eqc_addr, aeqc_addr;
int ret;
ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
@@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
return ret;
}
+ qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr);
/* QM_EQC_DW has 7 regs */
- ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+ ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7);
if (ret) {
dev_err(dev, "failed to read QM_EQC_DW\n");
return ret;
}
/* QM_AEQC_DW has 7 regs */
- ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+ ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7);
if (ret) {
dev_err(dev, "failed to read QM_AEQC_DW\n");
return ret;
@@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
{
struct device *dev = &qm->pdev->dev;
+ u32 eqc_addr, aeqc_addr;
int ret;
/* Check VF state */
@@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
return ret;
}
+ qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr);
/* QM_EQC_DW has 7 regs */
- ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+ ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7);
if (ret) {
dev_err(dev, "failed to write QM_EQC_DW\n");
return ret;
}
/* QM_AEQC_DW has 7 regs */
- ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+ ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7);
if (ret) {
dev_err(dev, "failed to write QM_AEQC_DW\n");
return ret;
@@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm;
struct pci_dev *vf_dev = vdev->pdev;
+ u32 val;
- /*
- * ACC VF dev BAR2 region consists of both functional register space
- * and migration control register space. For migration to work, we
- * need access to both. Hence, we map the entire BAR2 region here.
- * But unnecessarily exposing the migration BAR region to the Guest
- * has the potential to prevent/corrupt the Guest migration. Hence,
- * we restrict access to the migration control space from
- * Guest(Please see mmap/ioctl/read/write override functions).
- *
- * Please note that it is OK to expose the entire VF BAR if migration
- * is not supported or required as this cannot affect the ACC PF
- * configurations.
- *
- * Also the HiSilicon ACC VF devices supported by this driver on
- * HiSilicon hardware platforms are integrated end point devices
- * and the platform lacks the capability to perform any PCIe P2P
- * between these devices.
- */
+ val = readl(pf_qm->io_base + QM_MIG_REGION_SEL);
+ if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN))
+ hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL;
+ else
+ hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL;
- vf_qm->io_base =
- ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
- pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
- if (!vf_qm->io_base)
- return -EIO;
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) {
+ /*
+ * On hardware platforms greater than QM_HW_V3, the migration function
+ * register is placed in the BAR2 configuration region of the PF,
+ * and each VF device occupies 8KB of configuration space.
+ */
+ vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET +
+ hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE;
+ } else {
+ /*
+ * ACC VF dev BAR2 region consists of both functional register space
+ * and migration control register space. For migration to work, we
+ * need access to both. Hence, we map the entire BAR2 region here.
+ * But unnecessarily exposing the migration BAR region to the Guest
+ * has the potential to prevent/corrupt the Guest migration. Hence,
+ * we restrict access to the migration control space from
+ * Guest(Please see mmap/ioctl/read/write override functions).
+ *
+ * Please note that it is OK to expose the entire VF BAR if migration
+ * is not supported or required as this cannot affect the ACC PF
+ * configurations.
+ *
+ * Also the HiSilicon ACC VF devices supported by this driver on
+ * HiSilicon hardware platforms are integrated end point devices
+ * and the platform lacks the capability to perform any PCIe P2P
+ * between these devices.
+ */
+ vf_qm->io_base =
+ ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
+ pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
+ if (!vf_qm->io_base)
+ return -EIO;
+ }
vf_qm->fun_type = QM_HW_VF;
+ vf_qm->ver = pf_qm->ver;
vf_qm->pdev = vf_dev;
mutex_init(&vf_qm->mailbox_lock);
@@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev)
return !IS_ERR(pf_qm) ? pf_qm : NULL;
}
+static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev,
+ unsigned int index)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev =
+ hisi_acc_drvdata(vdev->pdev);
+
+ /*
+	 * On a device in the old HW_ACC_MIG_VF_CTRL mode, the ACC VF
+	 * BAR2 region encompasses both functional register space
+	 * and migration control register space.
+	 * Only the functional region should be reported to the Guest.
+ */
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL)
+ return (pci_resource_len(vdev->pdev, index) >> 1);
+ /*
+	 * On the new HW device, the migration control registers
+	 * have been moved to the PF device BAR2 region.
+ * The VF device BAR2 is entirely functional register space.
+ */
+ return pci_resource_len(vdev->pdev, index);
+}
+
static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
size_t count, loff_t *ppos,
size_t *new_count)
@@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
if (index == VFIO_PCI_BAR2_REGION_INDEX) {
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+ resource_size_t end;
+ end = hisi_acc_get_resource_len(vdev, index);
/* Check if access is for migration control region */
if (pos >= end)
return -EINVAL;
@@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev,
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
if (index == VFIO_PCI_BAR2_REGION_INDEX) {
u64 req_len, pgoff, req_start;
- resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+ resource_size_t end;
+ end = hisi_acc_get_resource_len(vdev, index);
req_len = vma->vm_end - vma->vm_start;
pgoff = vma->vm_pgoff &
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
@@ -1324,43 +1385,23 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev,
return vfio_pci_core_read(core_vdev, buf, new_count, ppos);
}
-static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
- unsigned long arg)
+static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
- if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
- struct vfio_pci_core_device *vdev =
- container_of(core_vdev, struct vfio_pci_core_device, vdev);
- struct pci_dev *pdev = vdev->pdev;
- struct vfio_region_info info;
- unsigned long minsz;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
- if (info.index == VFIO_PCI_BAR2_REGION_INDEX) {
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ if (info->index != VFIO_PCI_BAR2_REGION_INDEX)
+ return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
- /*
- * ACC VF dev BAR2 region consists of both functional
- * register space and migration control register space.
- * Report only the functional region to Guest.
- */
- info.size = pci_resource_len(pdev, info.index) / 2;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE |
- VFIO_REGION_INFO_FLAG_MMAP;
+ info->size = hisi_acc_get_resource_len(vdev, info->index);
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
- }
- }
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP;
+ return 0;
}
static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev)
@@ -1521,7 +1562,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
hisi_acc_vf_disable_fds(hisi_acc_vdev);
mutex_lock(&hisi_acc_vdev->open_mutex);
hisi_acc_vdev->dev_opened = false;
- iounmap(vf_qm->io_base);
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL)
+ iounmap(vf_qm->io_base);
mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_close_device(core_vdev);
}
@@ -1557,13 +1599,15 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
.release = vfio_pci_core_release_dev,
.open_device = hisi_acc_vfio_pci_open_device,
.close_device = hisi_acc_vfio_pci_close_device,
- .ioctl = hisi_acc_vfio_pci_ioctl,
+ .ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = hisi_acc_vfio_ioctl_get_region,
.device_feature = vfio_pci_core_ioctl_feature,
.read = hisi_acc_vfio_pci_read,
.write = hisi_acc_vfio_pci_write,
.mmap = hisi_acc_vfio_pci_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -1577,6 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.open_device = hisi_acc_vfio_pci_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index 91002ceeebc1..cd55eba64dfb 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -50,8 +50,10 @@
#define QM_QUE_ISO_CFG_V 0x0030
#define QM_PAGE_SIZE 0x0034
-#define QM_EQC_DW0 0X8000
-#define QM_AEQC_DW0 0X8020
+#define QM_EQC_VF_DW0		0x8000
+#define QM_AEQC_VF_DW0		0x8020
+#define QM_EQC_PF_DW0 0x1c00
+#define QM_AEQC_PF_DW0 0x1c20
#define ACC_DRV_MAJOR_VER 1
#define ACC_DRV_MINOR_VER 0
@@ -59,6 +61,22 @@
#define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC
#define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE
+#define QM_MIG_REGION_OFFSET 0x180000
+#define QM_MIG_REGION_SIZE 0x2000
+
+/*
+ * In HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live
+ * migration functionality is located in the latter 32KB of the VF's BAR2.
+ * The Guest is only provided with the first 32KB of the VF's BAR2.
+ * In HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live
+ * migration functionality is located in the PF's BAR2, and the entire 64KB
+ * of the VF's BAR2 is allocated to the Guest.
+ */
+enum hw_drv_mode {
+ HW_ACC_MIG_VF_CTRL = 0,
+ HW_ACC_MIG_PF_CTRL,
+};
+
struct acc_vf_data {
#define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state)
/* QM match information */
@@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device {
struct pci_dev *vf_dev;
struct hisi_qm *pf_qm;
struct hisi_qm vf_qm;
+ enum hw_drv_mode drv_mode;
/*
* vf_qm_state represents the QM_VF_STATE register value.
* It is set by Guest driver for the ACC VF dev indicating
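
A sketch of the PF-controlled layout the new header constants describe: in HW_ACC_MIG_PF_CTRL mode each VF gets an 8KB migration window inside the PF's BAR2, starting at a fixed offset and indexed by VF id, which is exactly the arithmetic hisi_acc_vf_qm_init() performs above.

```c
#include <linux/io.h>
#include <linux/types.h>

#define QM_MIG_REGION_OFFSET	0x180000
#define QM_MIG_REGION_SIZE	0x2000	/* one 8KB slot per VF */

static void __iomem *vf_mig_window(void __iomem *pf_bar2, u32 vf_id)
{
	return pf_bar2 + QM_MIG_REGION_OFFSET + vf_id * QM_MIG_REGION_SIZE;
}
```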
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 7ec47e736a8e..9c5970411d07 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
.open_device = mlx5vf_pci_open_device,
.close_device = mlx5vf_pci_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index e346392b72f6..84d142a47ec6 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -7,6 +7,8 @@
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/pm_runtime.h>
/*
* The device memory usable to the workloads running in the VM is cached
@@ -58,6 +60,8 @@ struct nvgrace_gpu_pci_core_device {
/* Lock to control device memory kernel mapping */
struct mutex remap_lock;
bool has_mig_hw_bug;
+ /* GPU has just been reset */
+ bool reset_done;
};
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -102,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
mutex_init(&nvdev->remap_lock);
}
+ /*
+ * GPU readiness is checked by reading the BAR0 registers.
+ *
+	 * ioremap BAR0 here so the mapping is already in place for the
+	 * readiness register reads taken on the first fault, before any
+	 * GPU memory mapping is established.
+ */
+ ret = vfio_pci_core_setup_barmap(vdev, 0);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
+ }
+
vfio_pci_core_finish_enable(vdev);
return 0;
@@ -130,6 +147,106 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
vfio_pci_core_close_device(core_vdev);
}
+static int nvgrace_gpu_wait_device_ready(void __iomem *io)
+{
+ unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+
+ do {
+ if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+ (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY))
+ return 0;
+ msleep(POLL_QUANTUM_MS);
+ } while (!time_after(jiffies, timeout));
+
+ return -ETIME;
+}
+
+/*
+ * If the GPU memory is accessed by the CPU while the GPU is not ready
+ * after reset, it can cause harmless corrected RAS events to be logged.
+ * Make sure the GPU is ready before establishing the mappings.
+ */
+static int
+nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
+{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ int ret;
+
+ lockdep_assert_held_read(&vdev->memory_lock);
+
+ if (!nvdev->reset_done)
+ return 0;
+
+ if (!__vfio_pci_memory_enabled(vdev))
+ return -EIO;
+
+ ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+ if (ret)
+ return ret;
+
+ nvdev->reset_done = false;
+
+ return 0;
+}
+
+static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ u64 pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data;
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ unsigned int index =
+ vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+ struct mem_region *memregion;
+ unsigned long pfn, addr;
+
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+ if (!memregion)
+ return VM_FAULT_SIGBUS;
+
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
+
+ if (is_aligned_for_order(vma, addr, pfn, order)) {
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ if (vdev->pm_runtime_engaged ||
+ nvgrace_gpu_check_device_ready(nvdev))
+ return VM_FAULT_SIGBUS;
+
+ ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+ }
+ }
+
+ dev_dbg_ratelimited(&vdev->pdev->dev,
+ "%s order = %d pfn 0x%lx: 0x%x\n",
+ __func__, order, pfn,
+ (unsigned int)ret);
+
+ return ret;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+ return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+ .fault = nvgrace_gpu_vfio_pci_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+ .huge_fault = nvgrace_gpu_vfio_pci_huge_fault,
+#endif
+};
+
static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
struct vm_area_struct *vma)
{
@@ -137,10 +254,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
struct mem_region *memregion;
- unsigned long start_pfn;
u64 req_len, pgoff, end;
unsigned int index;
- int ret = 0;
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
@@ -157,17 +272,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
- check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
return -EOVERFLOW;
/*
- * Check that the mapping request does not go beyond available device
- * memory size
+ * Check that the mapping request does not go beyond the exposed
+ * device memory size.
*/
if (end > memregion->memlength)
return -EINVAL;
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+
/*
* The carved out region of the device memory needs the NORMAL_NC
* property. Communicate as such to the hypervisor.
@@ -184,56 +300,31 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
}
- /*
- * Perform a PFN map to the memory and back the device BAR by the
- * GPU memory.
- *
- * The available GPU memory size may not be power-of-2 aligned. The
- * remainder is only backed by vfio_device_ops read/write handlers.
- *
- * During device reset, the GPU is safely disconnected to the CPU
- * and access to the BAR will be immediately returned preventing
- * machine check.
- */
- ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
- req_len, vma->vm_page_prot);
- if (ret)
- return ret;
-
- vma->vm_pgoff = start_pfn;
+ vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
+ vma->vm_private_data = nvdev;
return 0;
}
-static long
-nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned long arg)
+static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
struct vfio_region_info_cap_sparse_mmap *sparse;
- struct vfio_region_info info;
struct mem_region *memregion;
u32 size;
int ret;
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
/*
* Request to determine the BAR region information. Send the
* GPU memory information.
*/
- memregion = nvgrace_gpu_memregion(info.index, nvdev);
+ memregion = nvgrace_gpu_memregion(info->index, nvdev);
if (!memregion)
- return vfio_pci_core_ioctl(core_vdev,
- VFIO_DEVICE_GET_REGION_INFO, arg);
+ return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
size = struct_size(sparse, areas, 1);
@@ -252,49 +343,28 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
sparse->header.version = 1;
- ret = vfio_info_add_capability(&caps, &sparse->header, size);
+ ret = vfio_info_add_capability(caps, &sparse->header, size);
kfree(sparse);
if (ret)
return ret;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
/*
* The region memory size may not be power-of-2 aligned.
* Given that the memory is a BAR and may not be
* aligned, roundup to the next power-of-2.
*/
- info.size = memregion->bar_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
+ info->size = memregion->bar_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE |
VFIO_REGION_INFO_FLAG_MMAP;
-
- if (caps.size) {
- info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info.argsz < sizeof(info) + caps.size) {
- info.argsz = sizeof(info) + caps.size;
- info.cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(info));
- if (copy_to_user((void __user *)arg +
- sizeof(info), caps.buf,
- caps.size)) {
- kfree(caps.buf);
- return -EFAULT;
- }
- info.cap_offset = sizeof(info);
- }
- kfree(caps.buf);
- }
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
+ return 0;
}
static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
switch (cmd) {
- case VFIO_DEVICE_GET_REGION_INFO:
- return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
case VFIO_DEVICE_IOEVENTFD:
return -ENOTTY;
case VFIO_DEVICE_RESET:
@@ -510,6 +580,7 @@ static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
char __user *buf, size_t count, loff_t *ppos)
{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
struct mem_region *memregion;
@@ -536,9 +607,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
else
mem_count = min(count, memregion->memlength - (size_t)offset);
- ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
- if (ret)
- return ret;
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ ret = nvgrace_gpu_check_device_ready(nvdev);
+ if (ret)
+ return ret;
+
+ ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+ }
/*
* Only the device memory present on the hardware is mapped, which may
@@ -563,9 +640,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev,
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ int ret;
- if (nvgrace_gpu_memregion(index, nvdev))
- return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+ if (nvgrace_gpu_memregion(index, nvdev)) {
+ if (pm_runtime_resume_and_get(&vdev->pdev->dev))
+ return -EIO;
+ ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+ pm_runtime_put(&vdev->pdev->dev);
+ return ret;
+ }
if (index == VFIO_PCI_CONFIG_REGION_INDEX)
return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
@@ -627,6 +711,7 @@ static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
size_t count, loff_t *ppos, const char __user *buf)
{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
struct mem_region *memregion;
@@ -656,9 +741,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
*/
mem_count = min(count, memregion->memlength - (size_t)offset);
- ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
- if (ret)
- return ret;
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ ret = nvgrace_gpu_check_device_ready(nvdev);
+ if (ret)
+ return ret;
+
+ ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+ }
exitfn:
*ppos += count;
@@ -672,10 +763,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev,
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ int ret;
- if (nvgrace_gpu_memregion(index, nvdev))
- return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+ if (nvgrace_gpu_memregion(index, nvdev)) {
+ if (pm_runtime_resume_and_get(&vdev->pdev->dev))
+ return -EIO;
+ ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+ pm_runtime_put(&vdev->pdev->dev);
+ return ret;
+ }
if (index == VFIO_PCI_CONFIG_REGION_INDEX)
return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
@@ -683,6 +781,50 @@ nvgrace_gpu_write(struct vfio_device *core_vdev,
return vfio_pci_core_write(core_vdev, buf, count, ppos);
}
+static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev,
+ struct p2pdma_provider **provider,
+ unsigned int region_index,
+ struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev = container_of(
+ core_vdev, struct nvgrace_gpu_pci_core_device, core_device);
+ struct pci_dev *pdev = core_vdev->pdev;
+ struct mem_region *mem_region;
+
+ /*
+ * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) {
+	 *	The P2P properties of the non-BAR memory are the same as the
+ * BAR memory, so just use the provider for index 0. Someday
+ * when CXL gets P2P support we could create CXLish providers
+ * for the non-BAR memory.
+ * } else if (region_index == USEMEM_REGION_INDEX) {
+	 *	This is actually cacheable memory and isn't treated as P2P in
+	 *	the chip. For now we have no way to push cacheable memory
+ * through everything and the Grace HW doesn't care what caching
+ * attribute is programmed into the SMMU. So use BAR 0.
+ * }
+ */
+ mem_region = nvgrace_gpu_memregion(region_index, nvdev);
+ if (mem_region) {
+ *provider = pcim_p2pdma_provider(pdev, 0);
+ if (!*provider)
+ return -EINVAL;
+ return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges,
+ nr_ranges,
+ mem_region->memphys,
+ mem_region->memlength);
+ }
+
+ return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index,
+ phys_vec, dma_ranges, nr_ranges);
+}
+
+static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = {
+ .get_dmabuf_phys = nvgrace_get_dmabuf_phys,
+};
+
static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
.name = "nvgrace-gpu-vfio-pci",
.init = vfio_pci_core_init_dev,
@@ -690,6 +832,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
.open_device = nvgrace_gpu_open_device,
.close_device = nvgrace_gpu_close_device,
.ioctl = nvgrace_gpu_ioctl,
+ .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = nvgrace_gpu_read,
.write = nvgrace_gpu_write,
@@ -703,6 +846,10 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
+static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = {
+ .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
+};
+
static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
.name = "nvgrace-gpu-vfio-pci-core",
.init = vfio_pci_core_init_dev,
@@ -710,6 +857,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
.open_device = nvgrace_gpu_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
@@ -893,11 +1041,10 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
* Ensure that the BAR0 region is enabled before accessing the
* registers.
*/
-static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
+static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
{
- unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
void __iomem *io;
- int ret = -ETIME;
+ int ret;
ret = pci_enable_device(pdev);
if (ret)
@@ -913,16 +1060,8 @@ static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
goto iomap_exit;
}
- do {
- if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
- (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
- ret = 0;
- goto reg_check_exit;
- }
- msleep(POLL_QUANTUM_MS);
- } while (!time_after(jiffies, timeout));
+ ret = nvgrace_gpu_wait_device_ready(io);
-reg_check_exit:
pci_iounmap(pdev, io);
iomap_exit:
pci_release_selected_regions(pdev, 1 << 0);
@@ -939,7 +1078,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
u64 memphys, memlength;
int ret;
- ret = nvgrace_gpu_wait_device_ready(pdev);
+ ret = nvgrace_gpu_probe_check_device_ready(pdev);
if (ret)
return ret;
@@ -965,6 +1104,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
memphys, memlength);
if (ret)
goto out_put_vdev;
+ nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops;
+ } else {
+ nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops;
}
ret = vfio_pci_core_register_device(&nvdev->core_device);
@@ -1002,12 +1144,38 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
+/*
+ * The GPU reset must be serialized against the *first* mapping fault and
+ * the first read/write access that follows it, to prevent spurious RAS
+ * event logging.
+ *
+ * The first fault or access after a reset polls device readiness and then
+ * clears the flag recording the reset. The readiness test is done while
+ * holding the memory_lock read lock, and all vfio-pci initiated resets are
+ * expected to hold the memory_lock write lock to avoid races. However,
+ * .reset_done extends beyond the scope of vfio-pci initiated resets, so we
+ * cannot assert this behavior with lockdep_assert_held_write().
+ */
+static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_device, struct nvgrace_gpu_pci_core_device,
+ core_device);
+
+ nvdev->reset_done = true;
+}
+
+static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = {
+ .reset_done = nvgrace_gpu_vfio_pci_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = nvgrace_gpu_vfio_pci_table,
.probe = nvgrace_gpu_probe,
.remove = nvgrace_gpu_remove,
- .err_handler = &vfio_pci_core_err_handlers,
+ .err_handler = &nvgrace_gpu_vfio_pci_err_handlers,
.driver_managed_dma = true,
};
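
A hedged sketch of the contract implied by is_aligned_for_order(), which the nvgrace huge-fault handler above calls but whose definition lives elsewhere in the series: a PMD/PUD-sized insertion is only legal when the aligned range fits inside the VMA and the PFN is naturally aligned to the order. This mirrors the open-coded check removed from vfio_pci_core.c later in this series.

```c
#include <linux/mm.h>

static bool example_aligned_for_order(struct vm_area_struct *vma,
				      unsigned long addr, unsigned long pfn,
				      unsigned int order)
{
	/* order-0 (base page) faults are always insertable */
	if (!order)
		return true;

	return addr >= vma->vm_start &&
	       addr + (PAGE_SIZE << order) <= vma->vm_end &&
	       !(pfn & ((1UL << order) - 1));
}
```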
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index f3ccb0008f67..be103c74e969 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -195,6 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = {
.open_device = pds_vfio_open_device,
.close_device = pds_vfio_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
index a19b68043eb2..8fbdf7c6d666 100644
--- a/drivers/vfio/pci/qat/main.c
+++ b/drivers/vfio/pci/qat/main.c
@@ -609,6 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = {
.open_device = qat_vf_pci_open_device,
.close_device = qat_vf_pci_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
.mmap = vfio_pci_core_mmap,
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index ac10f14417f2..0c771064c0b8 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -132,6 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
.open_device = vfio_pci_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
@@ -147,6 +148,10 @@ static const struct vfio_device_ops vfio_pci_ops = {
.pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas,
};
+static const struct vfio_pci_device_ops vfio_pci_dev_ops = {
+ .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
+};
+
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct vfio_pci_core_device *vdev;
@@ -161,6 +166,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return PTR_ERR(vdev);
dev_set_drvdata(&pdev->dev, vdev);
+ vdev->pci_ops = &vfio_pci_dev_ops;
ret = vfio_pci_core_register_device(vdev);
if (ret)
goto out_put_vdev;
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 8f02f236b5b4..dc4e510e6e1b 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
return pdev->current_state < PCI_D3hot &&
(pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
}
+EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled);
/*
* Restore the *real* BARs after we detect a FLR or backdoor reset.
@@ -589,10 +590,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
- if (!new_mem)
+ if (!new_mem) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
- else
+ vfio_pci_dma_buf_move(vdev, true);
+ } else {
down_write(&vdev->memory_lock);
+ }
/*
* If the user is writing mem/io enable (new_mem/io) and we
@@ -627,6 +630,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
*virt_cmd &= cpu_to_le16(~mask);
*virt_cmd |= cpu_to_le16(new_cmd & mask);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
@@ -707,12 +712,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
pci_power_t state)
{
- if (state >= PCI_D3hot)
+ if (state >= PCI_D3hot) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
- else
+ vfio_pci_dma_buf_move(vdev, true);
+ } else {
down_write(&vdev->memory_lock);
+ }
vfio_pci_set_power_state(vdev, state);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
@@ -900,7 +909,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
}
@@ -982,7 +994,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
}
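
A hedged sketch of the revoke/restore pattern the config-write hooks above add: any path that can invalidate BAR access (memory-enable clear, D3hot entry, FLR) revokes exported DMABUFs under the memory_lock write side and restores them only if memory decode is still enabled afterwards. The helpers are the ones used in the hunks; the private header is assumed to declare them.

```c
#include "vfio_pci_priv.h"	/* assumed to declare the helpers below */

static void example_reset_with_dmabuf(struct vfio_pci_core_device *vdev)
{
	vfio_pci_zap_and_down_write_memory_lock(vdev);
	vfio_pci_dma_buf_move(vdev, true);		/* revoke */

	pci_try_reset_function(vdev->pdev);

	if (__vfio_pci_memory_enabled(vdev))
		vfio_pci_dma_buf_move(vdev, false);	/* restore */
	up_write(&vdev->memory_lock);
}
```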
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 7dcf5439dedc..3a11e6f450f7 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -28,6 +28,7 @@
#include <linux/nospec.h>
#include <linux/sched/mm.h>
#include <linux/iommufd.h>
+#include <linux/pci-p2pdma.h>
#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif
@@ -41,6 +42,40 @@ static bool nointxmask;
static bool disable_vga;
static bool disable_idle_d3;
+static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu)
+{
+ struct vfio_pci_eventfd *eventfd =
+ container_of(rcu, struct vfio_pci_eventfd, rcu);
+
+ eventfd_ctx_put(eventfd->ctx);
+ kfree(eventfd);
+}
+
+int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_eventfd __rcu **peventfd,
+ struct eventfd_ctx *ctx)
+{
+ struct vfio_pci_eventfd *new = NULL;
+ struct vfio_pci_eventfd *old;
+
+ lockdep_assert_held(&vdev->igate);
+
+ if (ctx) {
+ new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
+ if (!new)
+ return -ENOMEM;
+
+ new->ctx = ctx;
+ }
+
+ old = rcu_replace_pointer(*peventfd, new,
+ lockdep_is_held(&vdev->igate));
+ if (old)
+ call_rcu(&old->rcu, vfio_pci_eventfd_rcu_free);
+
+ return 0;
+}
+
/* List of PF's that vfio_pci_core_sriov_configure() has been called on */
static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
static LIST_HEAD(vfio_pci_sriov_pfs);
@@ -286,6 +321,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
* semaphore.
*/
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
+
if (vdev->pm_runtime_engaged) {
up_write(&vdev->memory_lock);
return -EINVAL;
@@ -299,11 +336,9 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
return 0;
}
-static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
+static int vfio_pci_core_pm_entry(struct vfio_pci_core_device *vdev, u32 flags,
void __user *arg, size_t argsz)
{
- struct vfio_pci_core_device *vdev =
- container_of(device, struct vfio_pci_core_device, vdev);
int ret;
ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
@@ -320,12 +355,10 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
}
static int vfio_pci_core_pm_entry_with_wakeup(
- struct vfio_device *device, u32 flags,
+ struct vfio_pci_core_device *vdev, u32 flags,
struct vfio_device_low_power_entry_with_wakeup __user *arg,
size_t argsz)
{
- struct vfio_pci_core_device *vdev =
- container_of(device, struct vfio_pci_core_device, vdev);
struct vfio_device_low_power_entry_with_wakeup entry;
struct eventfd_ctx *efdctx;
int ret;
@@ -373,14 +406,14 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
*/
down_write(&vdev->memory_lock);
__vfio_pci_runtime_pm_exit(vdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
-static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags,
+static int vfio_pci_core_pm_exit(struct vfio_pci_core_device *vdev, u32 flags,
void __user *arg, size_t argsz)
{
- struct vfio_pci_core_device *vdev =
- container_of(device, struct vfio_pci_core_device, vdev);
int ret;
ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
@@ -695,15 +728,11 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
#endif
vfio_pci_core_disable(vdev);
+ vfio_pci_dma_buf_cleanup(vdev);
+
mutex_lock(&vdev->igate);
- if (vdev->err_trigger) {
- eventfd_ctx_put(vdev->err_trigger);
- vdev->err_trigger = NULL;
- }
- if (vdev->req_trigger) {
- eventfd_ctx_put(vdev->req_trigger);
- vdev->req_trigger = NULL;
- }
+ vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL);
+ vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL);
mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
@@ -996,42 +1025,36 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
-static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
- struct vfio_region_info __user *arg)
+int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
struct pci_dev *pdev = vdev->pdev;
- struct vfio_region_info info;
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
int i, ret;
- if (copy_from_user(&info, arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
+ switch (info->index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = pdev->cfg_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = pdev->cfg_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
break;
case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = pci_resource_len(pdev, info.index);
- if (!info.size) {
- info.flags = 0;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = pci_resource_len(pdev, info->index);
+ if (!info->size) {
+ info->flags = 0;
break;
}
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- if (vdev->bar_mmap_supported[info.index]) {
- info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
- if (info.index == vdev->msix_bar) {
- ret = msix_mmappable_cap(vdev, &caps);
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ if (vdev->bar_mmap_supported[info->index]) {
+ info->flags |= VFIO_REGION_INFO_FLAG_MMAP;
+ if (info->index == vdev->msix_bar) {
+ ret = msix_mmappable_cap(vdev, caps);
if (ret)
return ret;
}
@@ -1043,9 +1066,9 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
size_t size;
u16 cmd;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.flags = 0;
- info.size = 0;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->flags = 0;
+ info->size = 0;
if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
/*
@@ -1055,16 +1078,17 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
cmd = vfio_pci_memory_lock_and_enable(vdev);
io = pci_map_rom(pdev, &size);
if (io) {
- info.flags = VFIO_REGION_INFO_FLAG_READ;
+ info->flags = VFIO_REGION_INFO_FLAG_READ;
/* Report the BAR size, not the ROM size. */
- info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
+ info->size = pci_resource_len(pdev,
+ PCI_ROM_RESOURCE);
pci_unmap_rom(pdev, io);
}
vfio_pci_memory_unlock_and_restore(vdev, cmd);
} else if (pdev->rom && pdev->romlen) {
- info.flags = VFIO_REGION_INFO_FLAG_READ;
+ info->flags = VFIO_REGION_INFO_FLAG_READ;
/* Report BAR size as power of two. */
- info.size = roundup_pow_of_two(pdev->romlen);
+ info->size = roundup_pow_of_two(pdev->romlen);
}
break;
@@ -1073,10 +1097,10 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
if (!vdev->has_vga)
return -EINVAL;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = 0xc0000;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = 0xc0000;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
break;
default: {
@@ -1085,53 +1109,36 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
.header.version = 1
};
- if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+ if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
return -EINVAL;
- info.index = array_index_nospec(
- info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
+ info->index = array_index_nospec(
+ info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
- i = info.index - VFIO_PCI_NUM_REGIONS;
+ i = info->index - VFIO_PCI_NUM_REGIONS;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = vdev->region[i].size;
- info.flags = vdev->region[i].flags;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = vdev->region[i].size;
+ info->flags = vdev->region[i].flags;
cap_type.type = vdev->region[i].type;
cap_type.subtype = vdev->region[i].subtype;
- ret = vfio_info_add_capability(&caps, &cap_type.header,
+ ret = vfio_info_add_capability(caps, &cap_type.header,
sizeof(cap_type));
if (ret)
return ret;
if (vdev->region[i].ops->add_capability) {
ret = vdev->region[i].ops->add_capability(
- vdev, &vdev->region[i], &caps);
+ vdev, &vdev->region[i], caps);
if (ret)
return ret;
}
}
}
-
- if (caps.size) {
- info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info.argsz < sizeof(info) + caps.size) {
- info.argsz = sizeof(info) + caps.size;
- info.cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(info));
- if (copy_to_user(arg + 1, caps.buf, caps.size)) {
- kfree(caps.buf);
- return -EFAULT;
- }
- info.cap_offset = sizeof(*arg);
- }
-
- kfree(caps.buf);
- }
-
- return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+ return 0;
}
+EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info);
static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
struct vfio_irq_info __user *arg)
@@ -1227,7 +1234,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
*/
vfio_pci_set_power_state(vdev, PCI_D0);
+ vfio_pci_dma_buf_move(vdev, true);
ret = pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
return ret;
@@ -1457,8 +1467,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
return vfio_pci_ioctl_get_irq_info(vdev, uarg);
case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
- case VFIO_DEVICE_GET_REGION_INFO:
- return vfio_pci_ioctl_get_region_info(vdev, uarg);
case VFIO_DEVICE_IOEVENTFD:
return vfio_pci_ioctl_ioeventfd(vdev, uarg);
case VFIO_DEVICE_PCI_HOT_RESET:
@@ -1473,11 +1481,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
}
EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
-static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
- uuid_t __user *arg, size_t argsz)
+static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev,
+ u32 flags, uuid_t __user *arg,
+ size_t argsz)
{
- struct vfio_pci_core_device *vdev =
- container_of(device, struct vfio_pci_core_device, vdev);
uuid_t uuid;
int ret;
@@ -1504,16 +1511,21 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
void __user *arg, size_t argsz)
{
+ struct vfio_pci_core_device *vdev =
+ container_of(device, struct vfio_pci_core_device, vdev);
+
switch (flags & VFIO_DEVICE_FEATURE_MASK) {
case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
- return vfio_pci_core_pm_entry(device, flags, arg, argsz);
+ return vfio_pci_core_pm_entry(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP:
- return vfio_pci_core_pm_entry_with_wakeup(device, flags,
+ return vfio_pci_core_pm_entry_with_wakeup(vdev, flags,
arg, argsz);
case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT:
- return vfio_pci_core_pm_exit(device, flags, arg, argsz);
+ return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
- return vfio_pci_core_feature_token(device, flags, arg, argsz);
+ return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
+ case VFIO_DEVICE_FEATURE_DMA_BUF:
+ return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
default:
return -ENOTTY;
}
@@ -1640,49 +1652,49 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma)
return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
}
-static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
- unsigned int order)
+vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev,
+ struct vm_fault *vmf,
+ unsigned long pfn,
+ unsigned int order)
{
- struct vm_area_struct *vma = vmf->vma;
- struct vfio_pci_core_device *vdev = vma->vm_private_data;
- unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
- unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
- unsigned long pfn = vma_to_pfn(vma) + pgoff;
- vm_fault_t ret = VM_FAULT_SIGBUS;
-
- if (order && (addr < vma->vm_start ||
- addr + (PAGE_SIZE << order) > vma->vm_end ||
- pfn & ((1 << order) - 1))) {
- ret = VM_FAULT_FALLBACK;
- goto out;
- }
-
- down_read(&vdev->memory_lock);
+ lockdep_assert_held_read(&vdev->memory_lock);
if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
- goto out_unlock;
+ return VM_FAULT_SIGBUS;
switch (order) {
case 0:
- ret = vmf_insert_pfn(vma, vmf->address, pfn);
- break;
+ return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
case PMD_ORDER:
- ret = vmf_insert_pfn_pmd(vmf, pfn, false);
- break;
+ return vmf_insert_pfn_pmd(vmf, pfn, false);
#endif
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
case PUD_ORDER:
- ret = vmf_insert_pfn_pud(vmf, pfn, false);
+ return vmf_insert_pfn_pud(vmf, pfn, false);
-		break;
#endif
default:
- ret = VM_FAULT_FALLBACK;
+ return VM_FAULT_FALLBACK;
+ }
+}
+EXPORT_SYMBOL_GPL(vfio_pci_vmf_insert_pfn);
+
+static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct vfio_pci_core_device *vdev = vma->vm_private_data;
+ unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
+ unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ unsigned long pfn = vma_to_pfn(vma) + pgoff;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ if (is_aligned_for_order(vma, addr, pfn, order)) {
+ scoped_guard(rwsem_read, &vdev->memory_lock)
+ ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
}
-out_unlock:
- up_read(&vdev->memory_lock);
-out:
dev_dbg_ratelimited(&vdev->pdev->dev,
"%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
__func__, order,
@@ -1749,18 +1761,9 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
* Even though we don't make use of the barmap for the mmap,
* we need to request the region and the barmap tracks that.
*/
- if (!vdev->barmap[index]) {
- ret = pci_request_selected_regions(pdev,
- 1 << index, "vfio-pci");
- if (ret)
- return ret;
-
- vdev->barmap[index] = pci_iomap(pdev, index, 0);
- if (!vdev->barmap[index]) {
- pci_release_selected_regions(pdev, 1 << index);
- return -ENOMEM;
- }
- }
+ ret = vfio_pci_core_setup_barmap(vdev, index);
+ if (ret)
+ return ret;
vma->vm_private_data = vdev;
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
@@ -1800,21 +1803,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
struct vfio_pci_core_device *vdev =
container_of(core_vdev, struct vfio_pci_core_device, vdev);
struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_eventfd *eventfd;
- mutex_lock(&vdev->igate);
-
- if (vdev->req_trigger) {
+ rcu_read_lock();
+ eventfd = rcu_dereference(vdev->req_trigger);
+ if (eventfd) {
if (!(count % 10))
pci_notice_ratelimited(pdev,
"Relaying device request to user (#%u)\n",
count);
- eventfd_signal(vdev->req_trigger);
+ eventfd_signal(eventfd->ctx);
} else if (count == 0) {
pci_warn(pdev,
"No device request channel registered, blocked until released by user\n");
}
-
- mutex_unlock(&vdev->igate);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(vfio_pci_core_request);
@@ -2085,6 +2088,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
{
struct vfio_pci_core_device *vdev =
container_of(core_vdev, struct vfio_pci_core_device, vdev);
+ int ret;
vdev->pdev = to_pci_dev(core_vdev->dev);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
@@ -2094,6 +2098,10 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
INIT_LIST_HEAD(&vdev->dummy_resources_list);
INIT_LIST_HEAD(&vdev->ioeventfds_list);
INIT_LIST_HEAD(&vdev->sriov_pfs_item);
+ ret = pcim_p2pdma_init(vdev->pdev);
+ if (ret && ret != -EOPNOTSUPP)
+ return ret;
+ INIT_LIST_HEAD(&vdev->dmabufs);
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);
@@ -2227,13 +2235,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
pci_channel_state_t state)
{
struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+ struct vfio_pci_eventfd *eventfd;
- mutex_lock(&vdev->igate);
-
- if (vdev->err_trigger)
- eventfd_signal(vdev->err_trigger);
-
- mutex_unlock(&vdev->igate);
+ rcu_read_lock();
+ eventfd = rcu_dereference(vdev->err_trigger);
+ if (eventfd)
+ eventfd_signal(eventfd->ctx);
+ rcu_read_unlock();
return PCI_ERS_RESULT_CAN_RECOVER;
}
@@ -2458,6 +2466,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
break;
}
+ vfio_pci_dma_buf_move(vdev, true);
vfio_pci_zap_bars(vdev);
}
@@ -2486,8 +2495,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
err_undo:
list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
- vdev.dev_set_list)
+ vdev.dev_set_list) {
+ if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
+ }
list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
pm_runtime_put(&vdev->pdev->dev);
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
new file mode 100644
index 000000000000..d4d0f7d08c53
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ */
+#include <linux/dma-buf-mapping.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/dma-resv.h>
+
+#include "vfio_pci_priv.h"
+
+MODULE_IMPORT_NS("DMA_BUF");
+
+struct vfio_pci_dma_buf {
+ struct dma_buf *dmabuf;
+ struct vfio_pci_core_device *vdev;
+ struct list_head dmabufs_elm;
+ size_t size;
+ struct dma_buf_phys_vec *phys_vec;
+ struct p2pdma_provider *provider;
+ u32 nr_ranges;
+ u8 revoked : 1;
+};
+
+static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ if (!attachment->peer2peer)
+ return -EOPNOTSUPP;
+
+ if (priv->revoked)
+ return -ENODEV;
+
+ return 0;
+}
+
+static struct sg_table *
+vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
+ enum dma_data_direction dir)
+{
+ struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+
+ dma_resv_assert_held(priv->dmabuf->resv);
+
+ if (priv->revoked)
+ return ERR_PTR(-ENODEV);
+
+ return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
+ priv->phys_vec, priv->nr_ranges,
+ priv->size, dir);
+}
+
+static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
+ struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+ dma_buf_free_sgt(attachment, sgt, dir);
+}
+
+static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ /*
+	 * Either this or vfio_pci_dma_buf_cleanup() will remove the buffer
+	 * from the list. The refcount prevents both from doing so.
+ */
+ if (priv->vdev) {
+ down_write(&priv->vdev->memory_lock);
+ list_del_init(&priv->dmabufs_elm);
+ up_write(&priv->vdev->memory_lock);
+ vfio_device_put_registration(&priv->vdev->vdev);
+ }
+ kfree(priv->phys_vec);
+ kfree(priv);
+}
+
+static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
+ .attach = vfio_pci_dma_buf_attach,
+ .map_dma_buf = vfio_pci_dma_buf_map,
+ .unmap_dma_buf = vfio_pci_dma_buf_unmap,
+ .release = vfio_pci_dma_buf_release,
+};
+
+/*
+ * This is a temporary "private interconnect" between VFIO DMABUF and iommufd.
+ * It allows the two co-operating drivers to exchange the physical address of
+ * the BAR. This is to be replaced with a formal DMABUF system for negotiated
+ * interconnect types.
+ *
+ * If this function succeeds, the following are true:
+ * - There is one physical range and it points to MMIO
+ * - When move_notify is called it means revoke, not move; vfio_pci_dma_buf_map()
+ *   will fail while the buffer is revoked
+ */
+int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ struct vfio_pci_dma_buf *priv;
+
+ dma_resv_assert_held(attachment->dmabuf->resv);
+
+ if (attachment->dmabuf->ops != &vfio_pci_dmabuf_ops)
+ return -EOPNOTSUPP;
+
+ priv = attachment->dmabuf->priv;
+ if (priv->revoked)
+ return -ENODEV;
+
+	/* Handing more than one range to iommufd will require proper DMABUF support */
+ if (priv->nr_ranges != 1)
+ return -EOPNOTSUPP;
+
+ *phys = priv->phys_vec[0];
+ return 0;
+}
+EXPORT_SYMBOL_FOR_MODULES(vfio_pci_dma_buf_iommufd_map, "iommufd");
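A hedged sketch of the consumer side of this interconnect; the helper name and the attachment setup are hypothetical, only vfio_pci_dma_buf_iommufd_map() and the dma_resv locking rule come from this patch:

/* Hypothetical iommufd-side caller; attachment creation is elided */
static int example_get_bar_phys(struct dma_buf_attachment *attach,
				phys_addr_t *paddr, size_t *len)
{
	struct dma_buf_phys_vec phys;
	int ret;

	dma_resv_lock(attach->dmabuf->resv, NULL);
	ret = vfio_pci_dma_buf_iommufd_map(attach, &phys);
	dma_resv_unlock(attach->dmabuf->resv);
	if (ret)
		return ret;

	/* Per the contract above: exactly one range, pointing at MMIO */
	*paddr = phys.paddr;
	*len = phys.len;
	return 0;
}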
+
+int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges, phys_addr_t start,
+ phys_addr_t len)
+{
+ phys_addr_t max_addr;
+ unsigned int i;
+
+ max_addr = start + len;
+ for (i = 0; i < nr_ranges; i++) {
+ phys_addr_t end;
+
+ if (!dma_ranges[i].length)
+ return -EINVAL;
+
+ if (check_add_overflow(start, dma_ranges[i].offset,
+ &phys_vec[i].paddr) ||
+ check_add_overflow(phys_vec[i].paddr,
+ dma_ranges[i].length, &end))
+ return -EOVERFLOW;
+ if (end > max_addr)
+ return -EINVAL;
+
+ phys_vec[i].len = dma_ranges[i].length;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec);
+
+int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
+ struct p2pdma_provider **provider,
+ unsigned int region_index,
+ struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges)
+{
+ struct pci_dev *pdev = vdev->pdev;
+
+ *provider = pcim_p2pdma_provider(pdev, region_index);
+ if (!*provider)
+ return -EINVAL;
+
+ return vfio_pci_core_fill_phys_vec(
+ phys_vec, dma_ranges, nr_ranges,
+ pci_resource_start(pdev, region_index),
+ pci_resource_len(pdev, region_index));
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys);
+
+static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t *lengthp)
+{
+ size_t length = 0;
+ u32 i;
+
+ for (i = 0; i < dma_buf->nr_ranges; i++) {
+ u64 offset = dma_ranges[i].offset;
+ u64 len = dma_ranges[i].length;
+
+ if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+ return -EINVAL;
+
+ if (check_add_overflow(length, len, &length))
+ return -EINVAL;
+ }
+
+ /*
+	 * dma_iova_try_alloc() will WARN if userspace proposes a size that
+	 * is too big, e.g. with lots of ranges.
+ */
+ if ((u64)(length) & DMA_IOVA_USE_SWIOTLB)
+ return -EINVAL;
+
+ *lengthp = length;
+ return 0;
+}
+
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_dma_buf get_dma_buf = {};
+ struct vfio_region_dma_range *dma_ranges;
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct vfio_pci_dma_buf *priv;
+ size_t length;
+ int ret;
+
+ if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys)
+ return -EOPNOTSUPP;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+ sizeof(get_dma_buf));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
+ return -EFAULT;
+
+ if (!get_dma_buf.nr_ranges || get_dma_buf.flags)
+ return -EINVAL;
+
+ /*
+ * For PCI the region_index is the BAR number like everything else.
+ */
+ if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX)
+ return -ENODEV;
+
+ dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
+ sizeof(*dma_ranges));
+ if (IS_ERR(dma_ranges))
+ return PTR_ERR(dma_ranges);
+
+ ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length);
+ if (ret)
+ goto err_free_ranges;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto err_free_ranges;
+ }
+ priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec),
+ GFP_KERNEL);
+ if (!priv->phys_vec) {
+ ret = -ENOMEM;
+ goto err_free_priv;
+ }
+
+ priv->vdev = vdev;
+ priv->nr_ranges = get_dma_buf.nr_ranges;
+ priv->size = length;
+ ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
+ get_dma_buf.region_index,
+ priv->phys_vec, dma_ranges,
+ priv->nr_ranges);
+ if (ret)
+ goto err_free_phys;
+
+ kfree(dma_ranges);
+ dma_ranges = NULL;
+
+ if (!vfio_device_try_get_registration(&vdev->vdev)) {
+ ret = -ENODEV;
+ goto err_free_phys;
+ }
+
+ exp_info.ops = &vfio_pci_dmabuf_ops;
+ exp_info.size = priv->size;
+ exp_info.flags = get_dma_buf.open_flags;
+ exp_info.priv = priv;
+
+ priv->dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(priv->dmabuf)) {
+ ret = PTR_ERR(priv->dmabuf);
+ goto err_dev_put;
+ }
+
+ /* dma_buf_put() now frees priv */
+ INIT_LIST_HEAD(&priv->dmabufs_elm);
+ down_write(&vdev->memory_lock);
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->revoked = !__vfio_pci_memory_enabled(vdev);
+ list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
+ dma_resv_unlock(priv->dmabuf->resv);
+ up_write(&vdev->memory_lock);
+
+ /*
+ * dma_buf_fd() consumes the reference, when the file closes the dmabuf
+ * will be released.
+ */
+ ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
+ if (ret < 0)
+ goto err_dma_buf;
+ return ret;
+
+err_dma_buf:
+ dma_buf_put(priv->dmabuf);
+err_dev_put:
+ vfio_device_put_registration(&vdev->vdev);
+err_free_phys:
+ kfree(priv->phys_vec);
+err_free_priv:
+ kfree(priv);
+err_free_ranges:
+ kfree(dma_ranges);
+ return ret;
+}
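A userspace sketch of driving this feature. The VFIO_DEVICE_FEATURE_DMA_BUF feature id and the exact uapi struct layout come from the uapi half of this series and are assumptions here:

#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper: export all of BAR 0 as a dmabuf fd */
static int example_export_bar0(int device_fd, __u64 bar0_size)
{
	size_t sz = sizeof(struct vfio_device_feature) +
		    sizeof(struct vfio_device_feature_dma_buf) +
		    sizeof(struct vfio_region_dma_range);
	struct vfio_device_feature *feat = calloc(1, sz);
	struct vfio_device_feature_dma_buf *get;
	struct vfio_region_dma_range *range;
	int fd;

	if (!feat)
		return -1;

	feat->argsz = sz;
	feat->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF;

	get = (struct vfio_device_feature_dma_buf *)feat->data;
	get->region_index = 0;		/* BAR 0 */
	get->open_flags = O_CLOEXEC;
	get->nr_ranges = 1;

	/* offset/length must be page aligned, see validate_dmabuf_input() */
	range = get->dma_ranges;
	range->offset = 0;
	range->length = bar0_size;

	fd = ioctl(device_fd, VFIO_DEVICE_FEATURE, feat);
	free(feat);
	return fd;	/* dmabuf fd on success, negative on error */
}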
+
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
+{
+ struct vfio_pci_dma_buf *priv;
+ struct vfio_pci_dma_buf *tmp;
+
+ lockdep_assert_held_write(&vdev->memory_lock);
+
+ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ if (!get_file_active(&priv->dmabuf->file))
+ continue;
+
+ if (priv->revoked != revoked) {
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->revoked = revoked;
+ dma_buf_move_notify(priv->dmabuf);
+ dma_resv_unlock(priv->dmabuf->resv);
+ }
+ fput(priv->dmabuf->file);
+ }
+}
+
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_dma_buf *priv;
+ struct vfio_pci_dma_buf *tmp;
+
+ down_write(&vdev->memory_lock);
+ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ if (!get_file_active(&priv->dmabuf->file))
+ continue;
+
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ list_del_init(&priv->dmabufs_elm);
+ priv->vdev = NULL;
+ priv->revoked = true;
+ dma_buf_move_notify(priv->dmabuf);
+ dma_resv_unlock(priv->dmabuf->resv);
+ vfio_device_put_registration(&vdev->vdev);
+ fput(priv->dmabuf->file);
+ }
+ up_write(&vdev->memory_lock);
+}
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 30d3e921cb0d..c76e753b3cec 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
return 0;
}
-static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
+static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_eventfd __rcu **peventfd,
unsigned int count, uint32_t flags,
void *data)
{
/* DATA_NONE/DATA_BOOL enables loopback testing */
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- if (*ctx) {
- if (count) {
- eventfd_signal(*ctx);
- } else {
- eventfd_ctx_put(*ctx);
- *ctx = NULL;
- }
+ struct vfio_pci_eventfd *eventfd;
+
+ eventfd = rcu_dereference_protected(*peventfd,
+ lockdep_is_held(&vdev->igate));
+
+ if (!eventfd)
+ return -EINVAL;
+
+ if (count) {
+ eventfd_signal(eventfd->ctx);
return 0;
}
+
+ return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger;
@@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
return -EINVAL;
trigger = *(uint8_t *)data;
- if (trigger && *ctx)
- eventfd_signal(*ctx);
+
+ if (trigger) {
+ struct vfio_pci_eventfd *eventfd =
+ rcu_dereference_protected(*peventfd,
+ lockdep_is_held(&vdev->igate));
+
+ if (eventfd)
+ eventfd_signal(eventfd->ctx);
+ }
return 0;
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
@@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
fd = *(int32_t *)data;
if (fd == -1) {
- if (*ctx)
- eventfd_ctx_put(*ctx);
- *ctx = NULL;
+ return vfio_pci_eventfd_replace_locked(vdev,
+ peventfd, NULL);
} else if (fd >= 0) {
struct eventfd_ctx *efdctx;
+ int ret;
efdctx = eventfd_ctx_fdget(fd);
if (IS_ERR(efdctx))
return PTR_ERR(efdctx);
- if (*ctx)
- eventfd_ctx_put(*ctx);
+ ret = vfio_pci_eventfd_replace_locked(vdev,
+ peventfd, efdctx);
+ if (ret)
+ eventfd_ctx_put(efdctx);
- *ctx = efdctx;
+ return ret;
}
- return 0;
}
return -EINVAL;
@@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
return -EINVAL;
- return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger,
+ return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger,
count, flags, data);
}
@@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1)
return -EINVAL;
- return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger,
+ return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger,
count, flags, data);
}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index a9972eacb293..27ac280f00b9 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd {
bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
+int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_eventfd __rcu **peventfd,
+ struct eventfd_ctx *ctx);
+
int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
unsigned index, unsigned start, unsigned count,
void *data);
@@ -60,7 +64,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev);
int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
pci_power_t state);
-bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev);
u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
@@ -107,4 +110,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
}
+#ifdef CONFIG_VFIO_PCI_DMABUF
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz);
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
+#else
+static inline int
+vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz)
+{
+ return -ENOTTY;
+}
+static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+}
+static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
+ bool revoked)
+{
+}
+#endif
+
#endif
diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
index c7d7e27af386..cb3d5e57d3a3 100644
--- a/drivers/vfio/pci/virtio/common.h
+++ b/drivers/vfio/pci/virtio/common.h
@@ -109,10 +109,9 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev);
#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev);
-long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg);
int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg);
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev,
const char __user *buf, size_t count,
loff_t *ppos);
diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c
index 832af5ba267c..1ed349a55629 100644
--- a/drivers/vfio/pci/virtio/legacy_io.c
+++ b/drivers/vfio/pci/virtio/legacy_io.c
@@ -281,41 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user
}
int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg)
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
struct virtiovf_pci_core_device *virtvdev = container_of(
core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
- void __user *uarg = (void __user *)arg;
- struct vfio_region_info info = {};
- if (copy_from_user(&info, uarg, minsz))
- return -EFAULT;
+ if (info->index != VFIO_PCI_BAR0_REGION_INDEX)
+ return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
- case VFIO_PCI_BAR0_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = virtvdev->bar0_virtual_buf_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
-}
-
-long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
- unsigned long arg)
-{
- switch (cmd) {
- case VFIO_DEVICE_GET_REGION_INFO:
- return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg);
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = virtvdev->bar0_virtual_buf_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+ return 0;
}
static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index 8084f3e36a9f..d2e5cbca13c8 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -88,6 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = {
.open_device = virtiovf_pci_open_device,
.close_device = virtiovf_pci_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
@@ -108,7 +109,8 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = {
.release = virtiovf_pci_core_release_dev,
.open_device = virtiovf_pci_open_device,
.close_device = virtiovf_pci_close_device,
- .ioctl = virtiovf_vfio_pci_core_ioctl,
+ .ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = virtiovf_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = virtiovf_pci_core_read,
.write = virtiovf_pci_core_write,
@@ -130,6 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.open_device = virtiovf_pci_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c
index 9f5c527baa8a..fa754f203b2d 100644
--- a/drivers/vfio/platform/vfio_amba.c
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -115,6 +115,7 @@ static const struct vfio_device_ops vfio_amba_ops = {
.open_device = vfio_platform_open_device,
.close_device = vfio_platform_close_device,
.ioctl = vfio_platform_ioctl,
+ .get_region_info_caps = vfio_platform_ioctl_get_region_info,
.read = vfio_platform_read,
.write = vfio_platform_write,
.mmap = vfio_platform_mmap,
diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
index 512533501eb7..a4d3ace3e02d 100644
--- a/drivers/vfio/platform/vfio_platform.c
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -101,6 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = {
.open_device = vfio_platform_open_device,
.close_device = vfio_platform_close_device,
.ioctl = vfio_platform_ioctl,
+ .get_region_info_caps = vfio_platform_ioctl_get_region_info,
.read = vfio_platform_read,
.write = vfio_platform_write,
.mmap = vfio_platform_mmap,
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index 3bf1043cd795..c2990b7e900f 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -272,6 +272,24 @@ err_irq:
}
EXPORT_SYMBOL_GPL(vfio_platform_open_device);
+int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_platform_device *vdev =
+ container_of(core_vdev, struct vfio_platform_device, vdev);
+
+ if (info->index >= vdev->num_regions)
+ return -EINVAL;
+
+ /* map offset to the physical address */
+ info->offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info->index);
+ info->size = vdev->regions[info->index].size;
+ info->flags = vdev->regions[info->index].flags;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info);
+
long vfio_platform_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
@@ -300,28 +318,6 @@ long vfio_platform_ioctl(struct vfio_device *core_vdev,
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
- } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
- struct vfio_region_info info;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.index >= vdev->num_regions)
- return -EINVAL;
-
- /* map offset to the physical address */
- info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index);
- info.size = vdev->regions[info.index].size;
- info.flags = vdev->regions[info.index].flags;
-
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
-
} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
struct vfio_irq_info info;
diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h
index 8d8fab516849..05084212a76e 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -85,6 +85,9 @@ int vfio_platform_open_device(struct vfio_device *core_vdev);
void vfio_platform_close_device(struct vfio_device *core_vdev);
long vfio_platform_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg);
+int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
ssize_t vfio_platform_read(struct vfio_device *core_vdev,
char __user *buf, size_t count,
loff_t *ppos);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 38c8e9350a60..f7df90c423b4 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device)
if (refcount_dec_and_test(&device->refcount))
complete(&device->comp);
}
+EXPORT_SYMBOL_GPL(vfio_device_put_registration);
bool vfio_device_try_get_registration(struct vfio_device *device)
{
return refcount_inc_not_zero(&device->refcount);
}
+EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);
/*
* VFIO driver API
@@ -1259,6 +1261,51 @@ static int vfio_ioctl_device_feature(struct vfio_device *device,
}
}
+static long vfio_get_region_info(struct vfio_device *device,
+ struct vfio_region_info __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_region_info info = {};
+ struct vfio_info_cap caps = {};
+ int ret;
+
+ if (unlikely(!device->ops->get_region_info_caps))
+ return -EINVAL;
+
+ if (copy_from_user(&info, arg, minsz))
+ return -EFAULT;
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = device->ops->get_region_info_caps(device, &info, &caps);
+ if (ret)
+ goto out_free;
+
+ if (caps.size) {
+ info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ info.cap_offset = 0;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user(arg + 1, caps.buf, caps.size)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+ info.cap_offset = sizeof(info);
+ }
+ }
+
+	if (copy_to_user(arg, &info, minsz)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ kfree(caps.buf);
+ return ret;
+}
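Userspace consumes this with the usual two-call VFIO capability dance: probe with the bare struct, and if the kernel reports a larger argsz, re-issue with a buffer big enough for the capability chain. A sketch, with the helper name illustrative:

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static struct vfio_region_info *example_region_info(int device_fd, __u32 index)
{
	struct vfio_region_info *info = calloc(1, sizeof(*info));

	if (!info)
		return NULL;
	info->argsz = sizeof(*info);
	info->index = index;

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
		goto err;

	if (info->argsz > sizeof(*info)) {
		/* Kernel reported trailing capabilities; fetch them too */
		__u32 argsz = info->argsz;
		struct vfio_region_info *big = realloc(info, argsz);

		if (!big)
			goto err;
		info = big;
		memset((char *)info + sizeof(*info), 0,
		       argsz - sizeof(*info));
		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
			goto err;
		/* Chain starts at info->cap_offset when non-zero */
	}
	return info;
err:
	free(info);
	return NULL;
}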
+
static long vfio_device_fops_unl_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
{
@@ -1296,6 +1343,10 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
ret = vfio_ioctl_device_feature(device, uptr);
break;
+ case VFIO_DEVICE_GET_REGION_INFO:
+ ret = vfio_get_region_info(device, uptr);
+ break;
+
default:
if (unlikely(!device->ops->ioctl))
ret = -EINVAL;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8f7f50acb6d6..7f886d3dba7d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -69,15 +69,15 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
-static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = {
- VHOST_FEATURES |
- (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
- (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
- (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
- (1ULL << VIRTIO_F_RING_RESET) |
- (1ULL << VIRTIO_F_IN_ORDER),
- VIRTIO_BIT(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) |
- VIRTIO_BIT(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO),
+static const int vhost_net_bits[] = {
+ VHOST_FEATURES,
+ VHOST_NET_F_VIRTIO_NET_HDR,
+ VIRTIO_NET_F_MRG_RXBUF,
+ VIRTIO_F_ACCESS_PLATFORM,
+ VIRTIO_F_RING_RESET,
+ VIRTIO_F_IN_ORDER,
+ VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO,
+ VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO
};
enum {
@@ -1731,7 +1731,8 @@ out:
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
unsigned long arg)
{
- u64 all_features[VIRTIO_FEATURES_DWORDS];
+ const DEFINE_VHOST_FEATURES_ARRAY(vhost_net_features, vhost_net_bits);
+ u64 all_features[VIRTIO_FEATURES_U64S];
struct vhost_net *n = f->private_data;
void __user *argp = (void __user *)arg;
u64 __user *featurep = argp;
@@ -1763,7 +1764,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
/* Copy the net features, up to the user-provided buffer size */
argp += sizeof(u64);
- copied = min(count, VIRTIO_FEATURES_DWORDS);
+ copied = min(count, (u64)VIRTIO_FEATURES_U64S);
if (copy_to_user(argp, vhost_net_features,
copied * sizeof(u64)))
return -EFAULT;
@@ -1778,13 +1779,13 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
virtio_features_zero(all_features);
argp += sizeof(u64);
- copied = min(count, VIRTIO_FEATURES_DWORDS);
+ copied = min(count, (u64)VIRTIO_FEATURES_U64S);
if (copy_from_user(all_features, argp, copied * sizeof(u64)))
return -EFAULT;
/*
* Any feature specified by user-space above
- * VIRTIO_FEATURES_MAX is not supported by definition.
+ * VIRTIO_FEATURES_BITS is not supported by definition.
*/
for (i = copied; i < count; ++i) {
if (copy_from_user(&features, featurep + 1 + i,
@@ -1794,7 +1795,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
return -EOPNOTSUPP;
}
- for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++)
+ for (i = 0; i < VIRTIO_FEATURES_U64S; i++)
if (all_features[i] & ~vhost_net_features[i])
return -EOPNOTSUPP;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 98e4f68f4e3c..f43c1fe9fad9 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -197,11 +197,14 @@ enum {
};
/* Note: can't set VIRTIO_F_VERSION_1 yet, since that implies ANY_LAYOUT. */
-enum {
- VHOST_SCSI_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_SCSI_F_HOTPLUG) |
- (1ULL << VIRTIO_SCSI_F_T10_PI)
+static const int vhost_scsi_bits[] = {
+ VHOST_FEATURES,
+ VIRTIO_SCSI_F_HOTPLUG,
+ VIRTIO_SCSI_F_T10_PI
};
+#define VHOST_SCSI_FEATURES VHOST_FEATURES_U64(vhost_scsi_bits, 0)
+
#define VHOST_SCSI_MAX_TARGET 256
#define VHOST_SCSI_MAX_IO_VQ 1024
#define VHOST_SCSI_MAX_EVENT 128
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 42c955a5b211..1e4e36edbcd2 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -28,6 +28,12 @@
*/
#define VHOST_TEST_PKT_WEIGHT 256
+static const int vhost_test_bits[] = {
+ VHOST_FEATURES
+};
+
+#define VHOST_TEST_FEATURES VHOST_FEATURES_U64(vhost_test_bits, 0)
+
enum {
VHOST_TEST_VQ = 0,
VHOST_TEST_VQ_MAX = 1,
@@ -328,14 +334,14 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
return -EFAULT;
return vhost_test_set_backend(n, backend.index, backend.fd);
case VHOST_GET_FEATURES:
- features = VHOST_FEATURES;
+ features = VHOST_TEST_FEATURES;
if (copy_to_user(featurep, &features, sizeof features))
return -EFAULT;
return 0;
case VHOST_SET_FEATURES:
if (copy_from_user(&features, featurep, sizeof features))
return -EFAULT;
- if (features & ~VHOST_FEATURES)
+ if (features & ~VHOST_TEST_FEATURES)
return -EOPNOTSUPP;
return vhost_test_set_features(n, features);
case VHOST_RESET_OWNER:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a78226b37739..bccdc9eab267 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -804,11 +804,13 @@ static int vhost_kthread_worker_create(struct vhost_worker *worker,
ret = vhost_attach_task_to_cgroups(worker);
if (ret)
- goto stop_worker;
+ goto free_id;
worker->id = id;
return 0;
+free_id:
+ xa_erase(&dev->worker_xa, id);
stop_worker:
vhost_kthread_do_stop(worker);
return ret;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b49f08e4a1b4..4fe99765c5c7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -14,6 +14,7 @@
#include <linux/atomic.h>
#include <linux/vhost_iotlb.h>
#include <linux/irqbypass.h>
+#include <linux/unroll.h>
struct vhost_work;
struct vhost_task;
@@ -287,14 +288,39 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
eventfd_signal((vq)->error_ctx);\
} while (0)
-enum {
- VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
- (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
- (1ULL << VIRTIO_RING_F_EVENT_IDX) |
- (1ULL << VHOST_F_LOG_ALL) |
- (1ULL << VIRTIO_F_ANY_LAYOUT) |
- (1ULL << VIRTIO_F_VERSION_1)
-};
+#define VHOST_FEATURES \
+ VIRTIO_F_NOTIFY_ON_EMPTY, \
+ VIRTIO_RING_F_INDIRECT_DESC, \
+ VIRTIO_RING_F_EVENT_IDX, \
+ VHOST_F_LOG_ALL, \
+ VIRTIO_F_ANY_LAYOUT, \
+ VIRTIO_F_VERSION_1
+
+static inline u64 vhost_features_u64(const int *features, int size, int idx)
+{
+ u64 res = 0;
+
+ unrolled_count(VIRTIO_FEATURES_BITS)
+ for (int i = 0; i < size; ++i) {
+ int bit = features[i];
+
+ if (virtio_features_chk_bit(bit) && VIRTIO_U64(bit) == idx)
+ res |= VIRTIO_BIT(bit);
+ }
+ return res;
+}
+
+#define VHOST_FEATURES_U64(features, idx) \
+ vhost_features_u64(features, ARRAY_SIZE(features), idx)
+
+#define DEFINE_VHOST_FEATURES_ARRAY_ENTRY(idx, features) \
+ [idx] = VHOST_FEATURES_U64(features, idx),
+
+#define DEFINE_VHOST_FEATURES_ARRAY(array, features) \
+ u64 array[VIRTIO_FEATURES_U64S] = { \
+ UNROLL(VIRTIO_FEATURES_U64S, \
+ DEFINE_VHOST_FEATURES_ARRAY_ENTRY, features) \
+ }
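Sketch of the intended conversion pattern (the vhost-net and vhost-vsock hunks in this patch follow it): a driver lists plain feature bit numbers and the helpers fold them into per-u64 words. The example_ names are illustrative:

static const int example_feature_bits[] = {
	VHOST_FEATURES,			/* expands to the common bit list */
	VIRTIO_F_ACCESS_PLATFORM,
};

/* Word 0 covers feature bits 0..63, word 1 covers bits 64..127 */
static inline u64 example_features_low(void)
{
	return VHOST_FEATURES_U64(example_feature_bits, 0);
}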
/**
* vhost_vq_set_backend - Set backend.
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index ae01457ea2cd..0298ddc34824 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -29,12 +29,14 @@
*/
#define VHOST_VSOCK_PKT_WEIGHT 256
-enum {
- VHOST_VSOCK_FEATURES = VHOST_FEATURES |
- (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
- (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
+static const int vhost_vsock_bits[] = {
+ VHOST_FEATURES,
+ VIRTIO_F_ACCESS_PLATFORM,
+ VIRTIO_VSOCK_F_SEQPACKET
};
+#define VHOST_VSOCK_FEATURES VHOST_FEATURES_U64(vhost_vsock_bits, 0)
+
enum {
VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index a09eb4d62f82..5bdc6b82b30b 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -53,7 +53,7 @@ static ssize_t features_show(struct device *_d,
/* We actually represent this as a bitstring, as it could be
* arbitrary length in future. */
- for (i = 0; i < VIRTIO_FEATURES_MAX; i++)
+ for (i = 0; i < VIRTIO_FEATURES_BITS; i++)
len += sysfs_emit_at(buf, len, "%c",
__virtio_test_bit(dev, i) ? '1' : '0');
len += sysfs_emit_at(buf, len, "\n");
@@ -272,8 +272,8 @@ static int virtio_dev_probe(struct device *_d)
int err, i;
struct virtio_device *dev = dev_to_virtio(_d);
struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
- u64 device_features[VIRTIO_FEATURES_DWORDS];
- u64 driver_features[VIRTIO_FEATURES_DWORDS];
+ u64 device_features[VIRTIO_FEATURES_U64S];
+ u64 driver_features[VIRTIO_FEATURES_U64S];
u64 driver_features_legacy;
/* We have a driver! */
@@ -286,7 +286,7 @@ static int virtio_dev_probe(struct device *_d)
virtio_features_zero(driver_features);
for (i = 0; i < drv->feature_table_size; i++) {
unsigned int f = drv->feature_table[i];
- if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_MAX))
+ if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_BITS))
virtio_features_set_bit(driver_features, f);
}
@@ -303,7 +303,7 @@ static int virtio_dev_probe(struct device *_d)
}
if (virtio_features_test_bit(device_features, VIRTIO_F_VERSION_1)) {
- for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i)
+ for (i = 0; i < VIRTIO_FEATURES_U64S; ++i)
dev->features_array[i] = driver_features[i] &
device_features[i];
} else {
@@ -325,7 +325,7 @@ static int virtio_dev_probe(struct device *_d)
goto err;
if (drv->validate) {
- u64 features[VIRTIO_FEATURES_DWORDS];
+ u64 features[VIRTIO_FEATURES_U64S];
virtio_features_copy(features, dev->features_array);
err = drv->validate(dev);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 1b93d8c64361..74fe59f5a78c 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -983,7 +983,8 @@ static int virtballoon_probe(struct virtio_device *vdev)
goto out_del_vqs;
}
vb->balloon_wq = alloc_workqueue("balloon-wq",
- WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+ WQ_FREEZABLE | WQ_CPU_INTENSIVE | WQ_PERCPU,
+ 0);
if (!vb->balloon_wq) {
err = -ENOMEM;
goto out_del_vqs;
diff --git a/drivers/virtio/virtio_debug.c b/drivers/virtio/virtio_debug.c
index d58713ddf2e5..ccf1955a1183 100644
--- a/drivers/virtio/virtio_debug.c
+++ b/drivers/virtio/virtio_debug.c
@@ -8,12 +8,12 @@ static struct dentry *virtio_debugfs_dir;
static int virtio_debug_device_features_show(struct seq_file *s, void *data)
{
- u64 device_features[VIRTIO_FEATURES_DWORDS];
+ u64 device_features[VIRTIO_FEATURES_U64S];
struct virtio_device *dev = s->private;
unsigned int i;
virtio_get_features(dev, device_features);
- for (i = 0; i < VIRTIO_FEATURES_MAX; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS; i++) {
if (virtio_features_test_bit(device_features, i))
seq_printf(s, "%u\n", i);
}
@@ -26,7 +26,7 @@ static int virtio_debug_filter_features_show(struct seq_file *s, void *data)
struct virtio_device *dev = s->private;
unsigned int i;
- for (i = 0; i < VIRTIO_FEATURES_MAX; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS; i++) {
if (virtio_features_test_bit(dev->debugfs_filter_features, i))
seq_printf(s, "%u\n", i);
}
@@ -50,7 +50,7 @@ static int virtio_debug_filter_feature_add(void *data, u64 val)
{
struct virtio_device *dev = data;
- if (val >= VIRTIO_FEATURES_MAX)
+ if (val >= VIRTIO_FEATURES_BITS)
return -EINVAL;
virtio_features_set_bit(dev->debugfs_filter_features, val);
@@ -64,7 +64,7 @@ static int virtio_debug_filter_feature_del(void *data, u64 val)
{
struct virtio_device *dev = data;
- if (val >= VIRTIO_FEATURES_MAX)
+ if (val >= VIRTIO_FEATURES_BITS)
return -EINVAL;
virtio_features_clear_bit(dev->debugfs_filter_features, val);
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 9e503b7a58d8..413a8c353463 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -401,7 +401,7 @@ void vp_modern_get_extended_features(struct virtio_pci_modern_device *mdev,
int i;
virtio_features_zero(features);
- for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
u64 cur;
vp_iowrite32(i, &cfg->device_feature_select);
@@ -427,7 +427,7 @@ vp_modern_get_driver_extended_features(struct virtio_pci_modern_device *mdev,
int i;
virtio_features_zero(features);
- for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
u64 cur;
vp_iowrite32(i, &cfg->guest_feature_select);
@@ -448,7 +448,7 @@ void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev,
struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
int i;
- for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
u32 cur = features[i >> 1] >> (32 * (i & 1));
vp_iowrite32(i, &cfg->guest_feature_select);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 7b6205253b46..ddab68959671 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -3166,6 +3166,7 @@ EXPORT_SYMBOL_GPL(virtqueue_map_alloc_coherent);
* @vdev: the virtio device we are talking to
* @map: metadata for performing mapping
* @size: the size of the buffer
+ * @vaddr: the virtual address that needs to be freed
* @map_handle: the mapped address that needs to be freed
*
*/
@@ -3190,7 +3191,7 @@ EXPORT_SYMBOL_GPL(virtqueue_map_free_coherent);
* @dir: mapping direction
* @attrs: mapping attributes
*
- * Returns mapped address. Caller should check that by virtqueue_mapping_error().
+ * Returns mapped address. Caller should check that by virtqueue_map_mapping_error().
*/
dma_addr_t virtqueue_map_page_attrs(const struct virtqueue *_vq,
struct page *page,
@@ -3249,7 +3250,7 @@ EXPORT_SYMBOL_GPL(virtqueue_unmap_page_attrs);
* The caller calls this to do dma mapping in advance. The DMA address can be
* passed to this _vq when it is in pre-mapped mode.
*
- * return mapped address. Caller should check that by virtqueue_mapping_error().
+ * return mapped address. Caller should check that by virtqueue_map_mapping_error().
*/
dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr,
size_t size,
@@ -3299,7 +3300,7 @@ void virtqueue_unmap_single_attrs(const struct virtqueue *_vq,
EXPORT_SYMBOL_GPL(virtqueue_unmap_single_attrs);
/**
- * virtqueue_mapping_error - check dma address
+ * virtqueue_map_mapping_error - check dma address
* @_vq: the struct virtqueue we're talking about.
* @addr: DMA address
*
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index f9a29045eca0..0a801f67b599 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -80,7 +80,7 @@ static void virtio_vdpa_set_status(struct virtio_device *vdev, u8 status)
{
struct vdpa_device *vdpa = vd_get_vdpa(vdev);
- return vdpa_set_status(vdpa, status);
+ vdpa_set_status(vdpa, status);
}
static void virtio_vdpa_reset(struct virtio_device *vdev)
diff --git a/include/dt-bindings/memory/mediatek,mt8189-memory-port.h b/include/dt-bindings/memory/mediatek,mt8189-memory-port.h
new file mode 100644
index 000000000000..849fead3d0f7
--- /dev/null
+++ b/include/dt-bindings/memory/mediatek,mt8189-memory-port.h
@@ -0,0 +1,283 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2025 MediaTek Inc.
+ * Author: Zhengnan chen <zhengnan.chen@mediatek.com>
+ */
+#ifndef _DT_BINDINGS_MEMORY_MEDIATEK_MT8189_MEMORY_PORT_H_
+#define _DT_BINDINGS_MEMORY_MEDIATEK_MT8189_MEMORY_PORT_H_
+
+#include <dt-bindings/memory/mtk-memory-port.h>
+
+#define SMI_L0_ID (0)
+#define SMI_L1_ID (1)
+#define SMI_L2_ID (2)
+#define SMI_L4_ID (3)
+#define SMI_L7_ID (4)
+#define SMI_L9_ID (5)
+#define SMI_L11_ID (6)
+#define SMI_L13_ID (7)
+#define SMI_L14_ID (8)
+#define SMI_L16_ID (9)
+#define SMI_L17_ID (10)
+#define SMI_L19_ID (11)
+#define SMI_L20_ID (12)
+
+/*
+ * The MM IOMMU supports a 16GB DMA address space. We separate it into four
+ * ranges: 0 ~ 4G; 4G ~ 8G; 8G ~ 12G; 12G ~ 16G. The masters may be placed
+ * in any one of these regions, BUT:
+ * a) Make sure all the ports inside a larb are in one range.
+ * b) The IOVA of any master can NOT cross the 4G/8G/12G boundary.
+ *
+ * This is the suggested mapping in this SoC:
+ *
+ * modules dma-address-region larbs-ports
+ * disp/mdp 0 ~ 4G larb0/1/2
+ * vcodec 4G ~ 8G larb4/7
+ * imgsys/cam/ipesys 8G ~ 12G the other larbs.
+ * N/A 12G ~ 16G
+ */
+
+/* Larb0 -- disp */
+#define M4U_L0_P0_DISP_OVL0_4L_HDR MTK_M4U_ID(SMI_L0_ID, 0)
+#define M4U_L0_P1_DISP_OVL0_4L_RDMA0 MTK_M4U_ID(SMI_L0_ID, 1)
+#define M4U_L0_P2_DISP_OVL1_4L_RDMA1 MTK_M4U_ID(SMI_L0_ID, 2)
+#define M4U_L0_P3_DISP_OVL0_4L_RDMA2 MTK_M4U_ID(SMI_L0_ID, 3)
+#define M4U_L0_P4_DISP_OVL1_4L_RDMA3 MTK_M4U_ID(SMI_L0_ID, 4)
+#define M4U_L0_P5_DISP_RDMA0 MTK_M4U_ID(SMI_L0_ID, 5)
+#define M4U_L0_P6_DISP_WDMA0 MTK_M4U_ID(SMI_L0_ID, 6)
+#define M4U_L0_P7_DISP_FAKE_ENG0 MTK_M4U_ID(SMI_L0_ID, 7)
+
+/* Larb1 -- disp */
+#define M4U_L1_P0_DISP_OVL1_4L_HDR MTK_M4U_ID(SMI_L1_ID, 0)
+#define M4U_L1_P1_DISP_OVL1_4L_RDMA0 MTK_M4U_ID(SMI_L1_ID, 1)
+#define M4U_L1_P2_DISP_OVL0_4L_RDMA1 MTK_M4U_ID(SMI_L1_ID, 2)
+#define M4U_L1_P3_DISP_OVL1_4L_RDMA2 MTK_M4U_ID(SMI_L1_ID, 3)
+#define M4U_L1_P4_DISP_OVL0_4L_RDMA3 MTK_M4U_ID(SMI_L1_ID, 4)
+#define M4U_L1_P5_DISP_RDMA1 MTK_M4U_ID(SMI_L1_ID, 5)
+#define M4U_L1_P6_DISP_WDMA1 MTK_M4U_ID(SMI_L1_ID, 6)
+#define M4U_L1_P7_DISP_FAKE_ENG1 MTK_M4U_ID(SMI_L1_ID, 7)
+
+/* Larb2 -- mmlsys(mdp) */
+#define M4U_L2_P0_MDP_RDMA0 MTK_M4U_ID(SMI_L2_ID, 0)
+#define M4U_L2_P1_MDP_RDMA1 MTK_M4U_ID(SMI_L2_ID, 1)
+#define M4U_L2_P2_MDP_WROT0 MTK_M4U_ID(SMI_L2_ID, 2)
+#define M4U_L2_P3_MDP_WROT1 MTK_M4U_ID(SMI_L2_ID, 3)
+#define M4U_L2_P4_MDP_DUMMY0 MTK_M4U_ID(SMI_L2_ID, 4)
+#define M4U_L2_P5_MDP_DUMMY1 MTK_M4U_ID(SMI_L2_ID, 5)
+#define M4U_L2_P6_MDP_RDMA2 MTK_M4U_ID(SMI_L2_ID, 6)
+#define M4U_L2_P7_MDP_RDMA3 MTK_M4U_ID(SMI_L2_ID, 7)
+#define M4U_L2_P8_MDP_WROT2 MTK_M4U_ID(SMI_L2_ID, 8)
+#define M4U_L2_P9_MDP_WROT3 MTK_M4U_ID(SMI_L2_ID, 9)
+#define M4U_L2_P10_DISP_FAKE0 MTK_M4U_ID(SMI_L2_ID, 10)
+
+/* Larb3: null */
+
+/* Larb4 -- vdec */
+#define M4U_L4_P0_HW_VDEC_MC_EXT MTK_M4U_ID(SMI_L4_ID, 0)
+#define M4U_L4_P1_HW_VDEC_UFO_EXT MTK_M4U_ID(SMI_L4_ID, 1)
+#define M4U_L4_P2_HW_VDEC_PP_EXT MTK_M4U_ID(SMI_L4_ID, 2)
+#define M4U_L4_P3_HW_VDEC_PRED_RD_EXT MTK_M4U_ID(SMI_L4_ID, 3)
+#define M4U_L4_P4_HW_VDEC_PRED_WR_EXT MTK_M4U_ID(SMI_L4_ID, 4)
+#define M4U_L4_P5_HW_VDEC_PPWRAP_EXT MTK_M4U_ID(SMI_L4_ID, 5)
+#define M4U_L4_P6_HW_VDEC_TILE_EXT MTK_M4U_ID(SMI_L4_ID, 6)
+#define M4U_L4_P7_HW_VDEC_VLD_EXT MTK_M4U_ID(SMI_L4_ID, 7)
+#define M4U_L4_P8_HW_VDEC_VLD2_EXT MTK_M4U_ID(SMI_L4_ID, 8)
+#define M4U_L4_P9_HW_VDEC_AVC_MV_EXT MTK_M4U_ID(SMI_L4_ID, 9)
+#define M4U_L4_P10_HW_VDEC_RG_CTRL_DMA_EXT MTK_M4U_ID(SMI_L4_ID, 10)
+#define M4U_L4_P11_HW_VDEC_UFO_ENC_EXT MTK_M4U_ID(SMI_L4_ID, 11)
+
+/* Larb5: null */
+
+/* Larb6: null */
+
+/* Larb7 -- venc */
+#define M4U_L7_P0_VENC_RCPU MTK_M4U_ID(SMI_L7_ID, 0)
+#define M4U_L7_P1_VENC_REC MTK_M4U_ID(SMI_L7_ID, 1)
+#define M4U_L7_P2_VENC_BSDMA MTK_M4U_ID(SMI_L7_ID, 2)
+#define M4U_L7_P3_VENC_SV_COMV MTK_M4U_ID(SMI_L7_ID, 3)
+#define M4U_L7_P4_VENC_RD_COMV MTK_M4U_ID(SMI_L7_ID, 4)
+#define M4U_L7_P5_JPGENC_Y_RDMA MTK_M4U_ID(SMI_L7_ID, 5)
+#define M4U_L7_P6_JPGENC_C_RDMA MTK_M4U_ID(SMI_L7_ID, 6)
+#define M4U_L7_P7_JPGENC_Q_RDMA MTK_M4U_ID(SMI_L7_ID, 7)
+#define M4U_L7_P8_VENC_SUB_W_LUMA MTK_M4U_ID(SMI_L7_ID, 8)
+#define M4U_L7_P9_JPGENC_BSDMA MTK_M4U_ID(SMI_L7_ID, 9)
+#define M4U_L7_P10_VENC_CUR_LUMA MTK_M4U_ID(SMI_L7_ID, 10)
+#define M4U_L7_P11_VENC_CUR_CHROMA MTK_M4U_ID(SMI_L7_ID, 11)
+#define M4U_L7_P12_VENC_REF_LUMA MTK_M4U_ID(SMI_L7_ID, 12)
+#define M4U_L7_P13_VENC_REF_CHROMA MTK_M4U_ID(SMI_L7_ID, 13)
+#define M4U_L7_P14_VENC_SUB_R_LUMA MTK_M4U_ID(SMI_L7_ID, 14)
+#define M4U_L7_P15_JPGDEC_WDMA MTK_M4U_ID(SMI_L7_ID, 15)
+#define M4U_L7_P16_JPGDEC_BSDMA MTK_M4U_ID(SMI_L7_ID, 16)
+#define M4U_L7_P17_JPGDEC_HUFF_OFFSET MTK_M4U_ID(SMI_L7_ID, 17)
+
+/* Larb8: null */
+
+/* Larb9 --imgsys */
+#define M4U_L9_P0_IMGI_D1 MTK_M4U_ID(SMI_L9_ID, 0)
+#define M4U_L9_P1_IMGBI_D1 MTK_M4U_ID(SMI_L9_ID, 1)
+#define M4U_L9_P2_DMGI_D1 MTK_M4U_ID(SMI_L9_ID, 2)
+#define M4U_L9_P3_DEPI_D1 MTK_M4U_ID(SMI_L9_ID, 3)
+#define M4U_L9_P4_LCE_D1 MTK_M4U_ID(SMI_L9_ID, 4)
+#define M4U_L9_P5_SMTI_D1 MTK_M4U_ID(SMI_L9_ID, 5)
+#define M4U_L9_P6_SMTO_D2 MTK_M4U_ID(SMI_L9_ID, 6)
+#define M4U_L9_P7_SMTO_D1 MTK_M4U_ID(SMI_L9_ID, 7)
+#define M4U_L9_P8_CRZO_D1 MTK_M4U_ID(SMI_L9_ID, 8)
+#define M4U_L9_P9_IMG3O_D1 MTK_M4U_ID(SMI_L9_ID, 9)
+#define M4U_L9_P10_VIPI_D1 MTK_M4U_ID(SMI_L9_ID, 10)
+#define M4U_L9_P11_SMTI_D5 MTK_M4U_ID(SMI_L9_ID, 11)
+#define M4U_L9_P12_TIMGO_D1 MTK_M4U_ID(SMI_L9_ID, 12)
+#define M4U_L9_P13_UFBC_W0 MTK_M4U_ID(SMI_L9_ID, 13)
+#define M4U_L9_P14_UFBC_R0 MTK_M4U_ID(SMI_L9_ID, 14)
+#define M4U_L9_P15_WPE_RDMA1 MTK_M4U_ID(SMI_L9_ID, 15)
+#define M4U_L9_P16_WPE_RDMA0 MTK_M4U_ID(SMI_L9_ID, 16)
+#define M4U_L9_P17_WPE_WDMA MTK_M4U_ID(SMI_L9_ID, 17)
+#define M4U_L9_P18_MFB_RDMA0 MTK_M4U_ID(SMI_L9_ID, 18)
+#define M4U_L9_P19_MFB_RDMA1 MTK_M4U_ID(SMI_L9_ID, 19)
+#define M4U_L9_P20_MFB_RDMA2 MTK_M4U_ID(SMI_L9_ID, 20)
+#define M4U_L9_P21_MFB_RDMA3 MTK_M4U_ID(SMI_L9_ID, 21)
+#define M4U_L9_P22_MFB_RDMA4 MTK_M4U_ID(SMI_L9_ID, 22)
+#define M4U_L9_P23_MFB_RDMA5 MTK_M4U_ID(SMI_L9_ID, 23)
+#define M4U_L9_P24_MFB_WDMA0 MTK_M4U_ID(SMI_L9_ID, 24)
+#define M4U_L9_P25_MFB_WDMA1 MTK_M4U_ID(SMI_L9_ID, 25)
+#define M4U_L9_P26_RESERVE6 MTK_M4U_ID(SMI_L9_ID, 26)
+#define M4U_L9_P27_RESERVE7 MTK_M4U_ID(SMI_L9_ID, 27)
+#define M4U_L9_P28_RESERVE8 MTK_M4U_ID(SMI_L9_ID, 28)
+
+/* Larb10: null */
+
+/* Larb11 -- imgsys */
+#define M4U_L11_P0_IMGI_D1 MTK_M4U_ID(SMI_L11_ID, 0)
+#define M4U_L11_P1_IMGBI_D1 MTK_M4U_ID(SMI_L11_ID, 1)
+#define M4U_L11_P2_DMGI_D1 MTK_M4U_ID(SMI_L11_ID, 2)
+#define M4U_L11_P3_DEPI_D1 MTK_M4U_ID(SMI_L11_ID, 3)
+#define M4U_L11_P4_LCE_D1 MTK_M4U_ID(SMI_L11_ID, 4)
+#define M4U_L11_P5_SMTI_D1 MTK_M4U_ID(SMI_L11_ID, 5)
+#define M4U_L11_P6_SMTO_D2 MTK_M4U_ID(SMI_L11_ID, 6)
+#define M4U_L11_P7_SMTO_D1 MTK_M4U_ID(SMI_L11_ID, 7)
+#define M4U_L11_P8_CRZO_D1 MTK_M4U_ID(SMI_L11_ID, 8)
+#define M4U_L11_P9_IMG3O_D1 MTK_M4U_ID(SMI_L11_ID, 9)
+#define M4U_L11_P10_VIPI_D1 MTK_M4U_ID(SMI_L11_ID, 10)
+#define M4U_L11_P11_SMTI_D5 MTK_M4U_ID(SMI_L11_ID, 11)
+#define M4U_L11_P12_TIMGO_D1 MTK_M4U_ID(SMI_L11_ID, 12)
+#define M4U_L11_P13_UFBC_W0 MTK_M4U_ID(SMI_L11_ID, 13)
+#define M4U_L11_P14_UFBC_R0 MTK_M4U_ID(SMI_L11_ID, 14)
+#define M4U_L11_P15_WPE_RDMA1 MTK_M4U_ID(SMI_L11_ID, 15)
+#define M4U_L11_P16_WPE_RDMA0 MTK_M4U_ID(SMI_L11_ID, 16)
+#define M4U_L11_P17_WPE_WDMA MTK_M4U_ID(SMI_L11_ID, 17)
+#define M4U_L11_P18_MFB_RDMA0 MTK_M4U_ID(SMI_L11_ID, 18)
+#define M4U_L11_P19_MFB_RDMA1 MTK_M4U_ID(SMI_L11_ID, 19)
+#define M4U_L11_P20_MFB_RDMA2 MTK_M4U_ID(SMI_L11_ID, 20)
+#define M4U_L11_P21_MFB_RDMA3 MTK_M4U_ID(SMI_L11_ID, 21)
+#define M4U_L11_P22_MFB_RDMA4 MTK_M4U_ID(SMI_L11_ID, 22)
+#define M4U_L11_P23_MFB_RDMA5 MTK_M4U_ID(SMI_L11_ID, 23)
+#define M4U_L11_P24_MFB_WDMA0 MTK_M4U_ID(SMI_L11_ID, 24)
+#define M4U_L11_P25_MFB_WDMA1 MTK_M4U_ID(SMI_L11_ID, 25)
+#define M4U_L11_P26_RESERVE6 MTK_M4U_ID(SMI_L11_ID, 26)
+#define M4U_L11_P27_RESERVE7 MTK_M4U_ID(SMI_L11_ID, 27)
+#define M4U_L11_P28_RESERVE8 MTK_M4U_ID(SMI_L11_ID, 28)
+
+/* Larb12: null */
+
+/* Larb13 -- cam */
+#define M4U_L13_P0_MRAWI MTK_M4U_ID(SMI_L13_ID, 0)
+#define M4U_L13_P1_MRAWO_0 MTK_M4U_ID(SMI_L13_ID, 1)
+#define M4U_L13_P2_MRAWO_1 MTK_M4U_ID(SMI_L13_ID, 2)
+#define M4U_L13_P3_CAMSV_1 MTK_M4U_ID(SMI_L13_ID, 3)
+#define M4U_L13_P4_CAMSV_2 MTK_M4U_ID(SMI_L13_ID, 4)
+#define M4U_L13_P5_CAMSV_3 MTK_M4U_ID(SMI_L13_ID, 5)
+#define M4U_L13_P6_CAMSV_4 MTK_M4U_ID(SMI_L13_ID, 6)
+#define M4U_L13_P7_CAMSV_5 MTK_M4U_ID(SMI_L13_ID, 7)
+#define M4U_L13_P8_CAMSV_6 MTK_M4U_ID(SMI_L13_ID, 8)
+#define M4U_L13_P9_CCUI MTK_M4U_ID(SMI_L13_ID, 9)
+#define M4U_L13_P10_CCUO MTK_M4U_ID(SMI_L13_ID, 10)
+#define M4U_L13_P11_FAKE MTK_M4U_ID(SMI_L13_ID, 11)
+#define M4U_L13_P12_PDAI_0 MTK_M4U_ID(SMI_L13_ID, 12)
+#define M4U_L13_P13_PDAI_1 MTK_M4U_ID(SMI_L13_ID, 13)
+#define M4U_L13_P14_PDAO MTK_M4U_ID(SMI_L13_ID, 14)
+
+/* Larb14 -- cam */
+#define M4U_L14_P0_RESERVE MTK_M4U_ID(SMI_L14_ID, 0)
+#define M4U_L14_P1_RESERVE MTK_M4U_ID(SMI_L14_ID, 1)
+#define M4U_L14_P2_RESERVE MTK_M4U_ID(SMI_L14_ID, 2)
+#define M4U_L14_P3_CAMSV_0 MTK_M4U_ID(SMI_L14_ID, 3)
+#define M4U_L14_P4_CCUI MTK_M4U_ID(SMI_L14_ID, 4)
+#define M4U_L14_P5_CCUO MTK_M4U_ID(SMI_L14_ID, 5)
+#define M4U_L14_P6_CAMSV_7 MTK_M4U_ID(SMI_L14_ID, 6)
+#define M4U_L14_P7_CAMSV_8 MTK_M4U_ID(SMI_L14_ID, 7)
+#define M4U_L14_P8_CAMSV_9 MTK_M4U_ID(SMI_L14_ID, 8)
+#define M4U_L14_P9_CAMSV_10 MTK_M4U_ID(SMI_L14_ID, 9)
+
+/* Larb15: null */
+
+/* Larb16 -- cam */
+#define M4U_L16_P0_IMGO_R1_A MTK_M4U_ID(SMI_L16_ID, 0)
+#define M4U_L16_P1_RRZO_R1_A MTK_M4U_ID(SMI_L16_ID, 1)
+#define M4U_L16_P2_CQI_R1_A MTK_M4U_ID(SMI_L16_ID, 2)
+#define M4U_L16_P3_BPCI_R1_A MTK_M4U_ID(SMI_L16_ID, 3)
+#define M4U_L16_P4_YUVO_R1_A MTK_M4U_ID(SMI_L16_ID, 4)
+#define M4U_L16_P5_UFDI_R2_A MTK_M4U_ID(SMI_L16_ID, 5)
+#define M4U_L16_P6_RAWI_R2_A MTK_M4U_ID(SMI_L16_ID, 6)
+#define M4U_L16_P7_RAWI_R3_A MTK_M4U_ID(SMI_L16_ID, 7)
+#define M4U_L16_P8_AAO_R1_A MTK_M4U_ID(SMI_L16_ID, 8)
+#define M4U_L16_P9_AFO_R1_A MTK_M4U_ID(SMI_L16_ID, 9)
+#define M4U_L16_P10_FLKO_R1_A MTK_M4U_ID(SMI_L16_ID, 10)
+#define M4U_L16_P11_LCESO_R1_A MTK_M4U_ID(SMI_L16_ID, 11)
+#define M4U_L16_P12_CRZO_R1_A MTK_M4U_ID(SMI_L16_ID, 12)
+#define M4U_L16_P13_LTMSO_R1_A MTK_M4U_ID(SMI_L16_ID, 13)
+#define M4U_L16_P14_RSSO_R1_A MTK_M4U_ID(SMI_L16_ID, 14)
+#define M4U_L16_P15_AAHO_R1_A MTK_M4U_ID(SMI_L16_ID, 15)
+#define M4U_L16_P16_LSCI_R1_A MTK_M4U_ID(SMI_L16_ID, 16)
+
+/* Larb17 -- cam */
+#define M4U_L17_P0_IMGO_R1_B MTK_M4U_ID(SMI_L17_ID, 0)
+#define M4U_L17_P1_RRZO_R1_B MTK_M4U_ID(SMI_L17_ID, 1)
+#define M4U_L17_P2_CQI_R1_B MTK_M4U_ID(SMI_L17_ID, 2)
+#define M4U_L17_P3_BPCI_R1_B MTK_M4U_ID(SMI_L17_ID, 3)
+#define M4U_L17_P4_YUVO_R1_B MTK_M4U_ID(SMI_L17_ID, 4)
+#define M4U_L17_P5_UFDI_R2_B MTK_M4U_ID(SMI_L17_ID, 5)
+#define M4U_L17_P6_RAWI_R2_B MTK_M4U_ID(SMI_L17_ID, 6)
+#define M4U_L17_P7_RAWI_R3_B MTK_M4U_ID(SMI_L17_ID, 7)
+#define M4U_L17_P8_AAO_R1_B MTK_M4U_ID(SMI_L17_ID, 8)
+#define M4U_L17_P9_AFO_R1_B MTK_M4U_ID(SMI_L17_ID, 9)
+#define M4U_L17_P10_FLKO_R1_B MTK_M4U_ID(SMI_L17_ID, 10)
+#define M4U_L17_P11_LCESO_R1_B MTK_M4U_ID(SMI_L17_ID, 11)
+#define M4U_L17_P12_CRZO_R1_B MTK_M4U_ID(SMI_L17_ID, 12)
+#define M4U_L17_P13_LTMSO_R1_B MTK_M4U_ID(SMI_L17_ID, 13)
+#define M4U_L17_P14_RSSO_R1_B MTK_M4U_ID(SMI_L17_ID, 14)
+#define M4U_L17_P15_AAHO_R1_B MTK_M4U_ID(SMI_L17_ID, 15)
+#define M4U_L17_P16_LSCI_R1_B MTK_M4U_ID(SMI_L17_ID, 16)
+
+/* Larb19 -- ipesys */
+#define M4U_L19_P0_DVS_RDMA MTK_M4U_ID(SMI_L19_ID, 0)
+#define M4U_L19_P1_DVS_WDMA MTK_M4U_ID(SMI_L19_ID, 1)
+#define M4U_L19_P2_DVP_RDMA MTK_M4U_ID(SMI_L19_ID, 2)
+#define M4U_L19_P3_DVP_WDMA MTK_M4U_ID(SMI_L19_ID, 3)
+
+/* Larb20 -- ipesys */
+#define M4U_L20_P0_FDVT_RDA_0 MTK_M4U_ID(SMI_L20_ID, 0)
+#define M4U_L20_P1_FDVT_RDB_0 MTK_M4U_ID(SMI_L20_ID, 1)
+#define M4U_L20_P2_FDVT_WRA_0 MTK_M4U_ID(SMI_L20_ID, 2)
+#define M4U_L20_P3_FDVT_WRB_0 MTK_M4U_ID(SMI_L20_ID, 3)
+#define M4U_L20_P4_RSC_RDMA MTK_M4U_ID(SMI_L20_ID, 4)
+#define M4U_L20_P5_RSC_WDMA MTK_M4U_ID(SMI_L20_ID, 5)
+
+/* fake larb21 for gce */
+#define M4U_L21_GCE_DM MTK_M4U_ID(21, 0)
+#define M4U_L21_GCE_MM MTK_M4U_ID(21, 1)
+
+/* fake larb & port for svp and dual svp and wfd */
+#define M4U_PORT_SVP_HEAP MTK_M4U_ID(22, 0)
+#define M4U_PORT_DUAL_SVP_HEAP MTK_M4U_ID(22, 1)
+#define M4U_PORT_WFD_HEAP MTK_M4U_ID(22, 2)
+
+/* fake larb0 for apu */
+#define M4U_L0_APU_DATA MTK_M4U_ID(0, 0)
+#define M4U_L0_APU_CODE MTK_M4U_ID(0, 1)
+#define M4U_L0_APU_SECURE MTK_M4U_ID(0, 2)
+#define M4U_L0_APU_VLM MTK_M4U_ID(0, 3)
+
+/* infra/peri */
+#define IFR_IOMMU_PORT_PCIE_0 MTK_IFAIOMMU_PERI_ID(0, 26)
+
+#endif
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 0ed60a91eca9..5b1236d8c65b 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -297,11 +297,11 @@ enum {
#define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2)
#define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3)
-#define CPER_ARM_CACHE_ERROR 0
-#define CPER_ARM_TLB_ERROR 1
-#define CPER_ARM_BUS_ERROR 2
-#define CPER_ARM_VENDOR_ERROR 3
-#define CPER_ARM_MAX_TYPE CPER_ARM_VENDOR_ERROR
+#define CPER_ARM_ERR_TYPE_MASK GENMASK(4, 1)
+#define CPER_ARM_CACHE_ERROR BIT(1)
+#define CPER_ARM_TLB_ERROR BIT(2)
+#define CPER_ARM_BUS_ERROR BIT(3)
+#define CPER_ARM_VENDOR_ERROR BIT(4)
#define CPER_ARM_ERR_VALID_TRANSACTION_TYPE BIT(0)
#define CPER_ARM_ERR_VALID_OPERATION_TYPE BIT(1)
@@ -588,6 +588,8 @@ const char *cper_mem_err_type_str(unsigned int);
const char *cper_mem_err_status_str(u64 status);
void cper_print_bits(const char *prefix, unsigned int bits,
const char * const strs[], unsigned int strs_size);
+int cper_bits_to_str(char *buf, int buf_size, unsigned long bits,
+ const char * const strs[], unsigned int strs_size);
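Since the ARM error type is now a multi-bit field, a caller decodes it with the new helper rather than a single-value lookup. A hedged sketch; the string table and output formatting are assumptions:

static const char * const example_arm_err_strs[] = {
	[1] = "cache error",
	[2] = "TLB error",
	[3] = "bus error",
	[4] = "vendor-specific error",
};

static void example_decode_arm_type(char *buf, int buf_size, u16 type)
{
	/* A combined cache+TLB error has both BIT(1) and BIT(2) set */
	cper_bits_to_str(buf, buf_size, type & CPER_ARM_ERR_TYPE_MASK,
			 example_arm_err_strs,
			 ARRAY_SIZE(example_arm_err_strs));
}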
void cper_mem_err_pack(const struct cper_sec_mem_err *,
struct cper_mem_err_compact *);
const char *cper_mem_err_unpack(struct trace_seq *,
diff --git a/include/linux/dma-buf-mapping.h b/include/linux/dma-buf-mapping.h
new file mode 100644
index 000000000000..a3c0ce2d3a42
--- /dev/null
+++ b/include/linux/dma-buf-mapping.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * DMA BUF Mapping Helpers
+ */
+#ifndef __DMA_BUF_MAPPING_H__
+#define __DMA_BUF_MAPPING_H__
+#include <linux/dma-buf.h>
+
+struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach,
+ struct p2pdma_provider *provider,
+ struct dma_buf_phys_vec *phys_vec,
+ size_t nr_ranges, size_t size,
+ enum dma_data_direction dir);
+void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt,
+ enum dma_data_direction dir);
+#endif
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index d58e329ac0e7..0bc492090237 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/dma-fence.h>
#include <linux/wait.h>
+#include <linux/pci-p2pdma.h>
struct device;
struct dma_buf;
@@ -531,6 +532,16 @@ struct dma_buf_export_info {
};
/**
+ * struct dma_buf_phys_vec - describe a contiguous chunk of memory
+ * @paddr: Physical address of this chunk
+ * @len: Length of this chunk
+ */
+struct dma_buf_phys_vec {
+ phys_addr_t paddr;
+ size_t len;
+};
+
+/**
* DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters
* @name: export-info name
*
diff --git a/include/linux/efi.h b/include/linux/efi.h
index b23ff8b83219..2a43094e23f7 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -290,7 +290,7 @@ typedef efi_status_t efi_get_variable_t (efi_char16_t *name, efi_guid_t *vendor,
unsigned long *data_size, void *data);
typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char16_t *name,
efi_guid_t *vendor);
-typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor,
+typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor,
u32 attr, unsigned long data_size,
void *data);
typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count);
@@ -373,6 +373,8 @@ void efi_native_runtime_setup(void);
#define EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID EFI_GUID(0x8b843e20, 0x8132, 0x4852, 0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c)
#define EFI_DEVICE_PATH_FROM_TEXT_PROTOCOL_GUID EFI_GUID(0x05c99a21, 0xc70f, 0x4ad2, 0x8a, 0x5f, 0x35, 0xdf, 0x33, 0x43, 0xf5, 0x1e)
#define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID EFI_GUID(0x9042a9de, 0x23dc, 0x4a38, 0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a)
+#define EFI_EDID_DISCOVERED_PROTOCOL_GUID EFI_GUID(0x1c0c34f6, 0xd380, 0x41fa, 0xa0, 0x49, 0x8a, 0xd0, 0x6c, 0x1a, 0x66, 0xaa)
+#define EFI_EDID_ACTIVE_PROTOCOL_GUID EFI_GUID(0xbd8c1056, 0x9f36, 0x44ec, 0x92, 0xa8, 0xa6, 0x33, 0x7f, 0x81, 0x79, 0x86)
#define EFI_PCI_IO_PROTOCOL_GUID EFI_GUID(0x4cf5b200, 0x68b8, 0x4ca5, 0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a)
#define EFI_FILE_INFO_ID EFI_GUID(0x09576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
#define EFI_SYSTEM_RESOURCE_TABLE_GUID EFI_GUID(0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80)
@@ -772,7 +774,7 @@ extern unsigned long efi_mem_attr_table;
*/
typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *, bool);
-extern int efi_memattr_init(void);
+extern void efi_memattr_init(void);
extern int efi_memattr_apply_permissions(struct mm_struct *mm,
efi_memattr_perm_setter fn);
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
new file mode 100644
index 000000000000..6a9a1acb5aad
--- /dev/null
+++ b/include/linux/generic_pt/common.h
@@ -0,0 +1,191 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_COMMON_H
+#define __GENERIC_PT_COMMON_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+#include <linux/bits.h>
+
+/**
+ * DOC: Generic Radix Page Table
+ *
+ * Generic Radix Page Table is a set of functions and helpers to efficiently
+ * parse radix style page tables typically seen in HW implementations. The
+ * interface is built to deliver code generation similar to the mm's
+ * pte/pmd/etc. system by fully inlining the exact code required to handle
+ * each table level.
+ *
+ * Like the mm subsystem, each format contributes its parsing implementation
+ * under common names and the common code implements the required algorithms.
+ *
+ * The system is divided into three logical levels:
+ *
+ * - The page table format and its manipulation functions
+ * - Generic helpers to give a consistent API regardless of underlying format
+ * - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM)
+ *
+ * Multiple implementations are supported. The intention is to have the generic
+ * format code be re-usable for whatever specialized implementation is required.
+ * The generic code is solely about the format of the radix tree; it does not
+ * include memory allocation or higher-level decisions, which are left to the
+ * implementation.
+ *
+ * The generic framework supports a superset of functions across many HW
+ * implementations:
+ *
+ * - Entries comprised of contiguous blocks of IO PTEs for larger page sizes
+ * - Multi-level tables, up to 6 levels, with a runtime-selected top level
+ * - Runtime variable table level size (ARM's concatenated tables)
+ * - Expandable top level allowing dynamic sizing of table levels
+ * - Optional leaf entries at any level
+ * - 32-bit/64-bit virtual and output addresses, using every address bit
+ * - Dirty tracking
+ * - Sign extended addressing
+ */
+
+/**
+ * struct pt_common - struct for all page table implementations
+ */
+struct pt_common {
+ /**
+ * @top_of_table: Encodes the table top pointer and the top level in a
+ * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower
+ * bits of the aligned table pointer are used for the level.
+ */
+ uintptr_t top_of_table;
+ /**
+ * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits
+ * must be zero. This may be less than what the page table format
+ * supports, but must not be more.
+ */
+ u8 max_oasz_lg2;
+ /**
+ * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits
+ * are 0 or 1 depending on pt_full_va_prefix(). This may be less than
+ * what the page table format supports, but must not be more. When
+	 * PT_FEAT_DYNAMIC_TOP is set, this reflects the maximum VA capability.
+ */
+ u8 max_vasz_lg2;
+ /**
+ * @features: Bitmap of `enum pt_features`
+ */
+ unsigned int features;
+};
+
+/* Encoding parameters for top_of_table */
+enum {
+ PT_TOP_LEVEL_BITS = 3,
+ PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0),
+};
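A hedged illustration of the top_of_table encoding described above: the table pointer's alignment frees its low PT_TOP_LEVEL_BITS bits to carry the level. The helper names here are hypothetical, not part of the patch.

```c
/* Hypothetical helpers, illustration only; not from this patch. */
static inline uintptr_t pt_encode_top(void *table, unsigned int level)
{
	/* The table allocation is aligned, so its low bits are zero. */
	return (uintptr_t)table | (level & PT_TOP_LEVEL_MASK);
}

static inline unsigned int pt_decode_level(uintptr_t top_of_table)
{
	return top_of_table & PT_TOP_LEVEL_MASK;
}

static inline void *pt_decode_table(uintptr_t top_of_table)
{
	return (void *)(top_of_table & ~(uintptr_t)PT_TOP_LEVEL_MASK);
}
```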
+
+/**
+ * enum pt_features - Features turned on in the table. Each symbol is a bit
+ * position.
+ */
+enum pt_features {
+ /**
+ * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before
+ * assuming the HW can read it. Otherwise a SMP release is sufficient
+ * for HW to read it.
+ */
+ PT_FEAT_DMA_INCOHERENT,
+ /**
+ * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to
+ * PT_VADDR_MAX.
+ */
+ PT_FEAT_FULL_VA,
+ /**
+ * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased
+ * dynamically during map. This requires HW support for atomically
+ * setting both the table top pointer and the starting table level.
+ */
+ PT_FEAT_DYNAMIC_TOP,
+ /**
+ * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign
+ * extends up to the full pt_vaddr_t. This divides the page table into
+ * three VA ranges::
+ *
+ * 0 -> 2^N - 1 Lower
+ * 2^N -> (MAX - 2^N - 1) Non-Canonical
+ * MAX - 2^N -> MAX Upper
+ *
+ * In this mode pt_common::max_vasz_lg2 includes the sign bit and the
+ * upper bits that don't fall within the translation are just validated.
+ *
+ * If not set there is no sign extension and valid VA goes from 0 to 2^N
+ * - 1.
+ */
+ PT_FEAT_SIGN_EXTEND,
+ /**
+ * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA
+ * ranges which will clean out any walk cache or any IOPTE fully
+ * contained by the range. The optimization objective is to minimize the
+ * number of flushes even if ranges include IOVA gaps that do not need
+ * to be flushed.
+ */
+ PT_FEAT_FLUSH_RANGE,
+ /**
+ * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that
+ * the optimization objective is to only flush IOVA that has been
+ * changed. This mode is suitable for cases like hypervisor shadowing
+ * where flushing unchanged ranges may cause the hypervisor to reparse
+	 * a significant amount of the page table.
+ */
+ PT_FEAT_FLUSH_RANGE_NO_GAPS,
+ /* private: */
+ PT_FEAT_FMT_START,
+};
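The PT_FEAT_SIGN_EXTEND layout above is the usual canonical-address scheme; below is a minimal sketch of the validation it implies, assuming va_bits is pt_common::max_vasz_lg2. The helper name is hypothetical.

```c
/* Hypothetical check: valid VAs have all bits above bit va_bits-1 equal to it. */
static inline bool pt_va_is_canonical(u64 va, unsigned int va_bits)
{
	/* Shift the sign bit up to bit 63, then arithmetic-shift it back. */
	s64 ext = (s64)(va << (64 - va_bits)) >> (64 - va_bits);

	return (u64)ext == va;
}
```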
+
+struct pt_amdv1 {
+ struct pt_common common;
+};
+
+enum {
+ /*
+ * The memory backing the tables is encrypted. Use __sme_set() to adjust
+ * the page table pointers in the tree. This only works with
+ * CONFIG_AMD_MEM_ENCRYPT.
+ */
+ PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START,
+ /*
+ * The PTEs are set to prevent cache incoherent traffic, such as PCI no
+ * snoop. This is set either at creation time or before the first map
+ * operation.
+ */
+ PT_FEAT_AMDV1_FORCE_COHERENCE,
+};
+
+struct pt_vtdss {
+ struct pt_common common;
+};
+
+enum {
+ /*
+ * The PTEs are set to prevent cache incoherent traffic, such as PCI no
+ * snoop. This is set either at creation time or before the first map
+ * operation.
+ */
+ PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START,
+ /*
+ * Prevent creating read-only PTEs. Used to work around HW errata
+ * ERRATA_772415_SPR17.
+ */
+ PT_FEAT_VTDSS_FORCE_WRITEABLE,
+};
+
+struct pt_x86_64 {
+ struct pt_common common;
+};
+
+enum {
+ /*
+ * The memory backing the tables is encrypted. Use __sme_set() to adjust
+ * the page table pointers in the tree. This only works with
+ * CONFIG_AMD_MEM_ENCRYPT.
+ */
+ PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START,
+};
+
+#endif
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
new file mode 100644
index 000000000000..9eefbb74efd0
--- /dev/null
+++ b/include/linux/generic_pt/iommu.h
@@ -0,0 +1,293 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_IOMMU_H
+#define __GENERIC_PT_IOMMU_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/iommu.h>
+#include <linux/mm_types.h>
+
+struct iommu_iotlb_gather;
+struct pt_iommu_ops;
+struct pt_iommu_driver_ops;
+struct iommu_dirty_bitmap;
+
+/**
+ * DOC: IOMMU Radix Page Table
+ *
+ * The IOMMU implementation of the Generic Page Table provides an ops struct
+ * that is useful to go with an iommu_domain to serve the DMA API, IOMMUFD and
+ * the generic map/unmap interface.
+ *
+ * This interface uses a caller-provided locking approach. The caller must have
+ * a VA range lock concept that prevents concurrent threads from calling ops on
+ * the same VA. Generally the range lock must be at least as large as a single
+ * map call.
+ */
+
+/**
+ * struct pt_iommu - Base structure for IOMMU page tables
+ *
+ * The format-specific struct will include this as the first member.
+ */
+struct pt_iommu {
+ /**
+ * @domain: The core IOMMU domain. The driver should use a union to
+ * overlay this memory with its previously existing domain struct to
+ * create an alias.
+ */
+ struct iommu_domain domain;
+
+ /**
+ * @ops: Function pointers to access the API
+ */
+ const struct pt_iommu_ops *ops;
+
+ /**
+ * @driver_ops: Function pointers provided by the HW driver to help
+ * manage HW details like caches.
+ */
+ const struct pt_iommu_driver_ops *driver_ops;
+
+ /**
+ * @nid: Node ID to use for table memory allocations. The IOMMU driver
+ * may want to set the NID to the device's NID, if there are multiple
+ * table walkers.
+ */
+ int nid;
+
+ /**
+ * @iommu_device: Device pointer used for any DMA cache flushing when
+ * PT_FEAT_DMA_INCOHERENT. This is the iommu device that created the
+	 * page table, which must have DMA ops that perform cache flushing.
+ */
+ struct device *iommu_device;
+};
+
+/**
+ * struct pt_iommu_info - Details about the IOMMU page table
+ *
+ * Returned from pt_iommu_ops->get_info()
+ */
+struct pt_iommu_info {
+ /**
+ * @pgsize_bitmap: A bitmask where each set bit indicates
+ * a page size that can be natively stored in the page table.
+ */
+ u64 pgsize_bitmap;
+};
+
+struct pt_iommu_ops {
+ /**
+ * @set_dirty: Make the iova write dirty
+ * @iommu_table: Table to manipulate
+ * @iova: IO virtual address to start
+ *
+ * This is only used by iommufd testing. It makes the iova dirty so that
+ * read_and_clear_dirty() will see it as dirty. Unlike all the other ops
+	 * this one is safe to call without holding any locks. It may return
+ * -EAGAIN if there is a race.
+ */
+ int (*set_dirty)(struct pt_iommu *iommu_table, dma_addr_t iova);
+
+ /**
+ * @get_info: Return the pt_iommu_info structure
+ * @iommu_table: Table to query
+ *
+ * Return some basic static information about the page table.
+ */
+ void (*get_info)(struct pt_iommu *iommu_table,
+ struct pt_iommu_info *info);
+
+ /**
+ * @deinit: Undo a format specific init operation
+ * @iommu_table: Table to destroy
+ *
+ * Release all of the memory. The caller must have already removed the
+ * table from all HW access and all caches.
+ */
+ void (*deinit)(struct pt_iommu *iommu_table);
+};
+
+/**
+ * struct pt_iommu_driver_ops - HW IOTLB cache flushing operations
+ *
+ * The IOMMU driver should implement these using container_of(iommu_table) to
+ * get to its iommu_domain derived structure. All ops can be called in atomic
+ * contexts as they are buried under DMA API calls.
+ */
+struct pt_iommu_driver_ops {
+ /**
+ * @change_top: Update the top of table pointer
+ * @iommu_table: Table to operate on
+ * @top_paddr: New CPU physical address of the top pointer
+ * @top_level: IOMMU PT level of the new top
+ *
+ * Called under the get_top_lock() spinlock. The driver must update all
+ * HW references to this domain with a new top address and
+	 * configuration. On return, mappings placed in the new top must be
+ * reachable by the HW.
+ *
+	 * top_level encodes the level in IOMMU PT format: level 0 has the
+	 * smallest page size, increasing from there. This has to be translated
+ * to any HW specific format. During this call the new top will not be
+ * visible to any other API.
+ *
+ * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
+ * enabled.
+ */
+ void (*change_top)(struct pt_iommu *iommu_table, phys_addr_t top_paddr,
+ unsigned int top_level);
+
+ /**
+ * @get_top_lock: lock to hold when changing the table top
+ * @iommu_table: Table to operate on
+ *
+	 * Return a lock to hold when changing the top of the page table that
+	 * is stored in HW. The lock will be held prior to calling
+ * change_top() and released once the top is fully visible.
+ *
+ * Typically this would be a lock that protects the iommu_domain's
+ * attachment list.
+ *
+ * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
+ * enabled.
+ */
+ spinlock_t *(*get_top_lock)(struct pt_iommu *iommu_table);
+};
+
+static inline void pt_iommu_deinit(struct pt_iommu *iommu_table)
+{
+ /*
+ * It is safe to call pt_iommu_deinit() before an init, or if init
+ * fails. The ops pointer will only become non-NULL if deinit needs to be
+ * run.
+ */
+ if (iommu_table->ops)
+ iommu_table->ops->deinit(iommu_table);
+}
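A hedged usage sketch of the contract above; the free path and naming are hypothetical, but pt_iommu embeds the iommu_domain as its first member, so container_of() works as shown.

```c
/* Hypothetical driver free path; safe even if init never ran or failed. */
static void my_domain_free(struct iommu_domain *domain)
{
	struct pt_iommu *iommu_table =
		container_of(domain, struct pt_iommu, domain);

	pt_iommu_deinit(iommu_table);	/* no-op while ops is still NULL */
	/* ... then free the containing driver domain structure ... */
}
```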
+
+/**
+ * struct pt_iommu_cfg - Common configuration values for all formats
+ */
+struct pt_iommu_cfg {
+ /**
+ * @features: Features required. Only these features will be turned on.
+ * The feature list should reflect what the IOMMU HW is capable of.
+ */
+ unsigned int features;
+ /**
+ * @hw_max_vasz_lg2: Maximum VA the IOMMU HW can support. This will
+ * imply the top level of the table.
+ */
+ u8 hw_max_vasz_lg2;
+ /**
+ * @hw_max_oasz_lg2: Maximum OA the IOMMU HW can support. The format
+ * might select a lower maximum OA.
+ */
+ u8 hw_max_oasz_lg2;
+};
+
+/* Generate the exported function signatures from iommu_pt.h */
+#define IOMMU_PROTOTYPES(fmt) \
+ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
+ dma_addr_t iova); \
+ int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \
+ unsigned long iova, phys_addr_t paddr, \
+ size_t pgsize, size_t pgcount, \
+ int prot, gfp_t gfp, size_t *mapped); \
+ size_t pt_iommu_##fmt##_unmap_pages( \
+ struct iommu_domain *domain, unsigned long iova, \
+ size_t pgsize, size_t pgcount, \
+ struct iommu_iotlb_gather *iotlb_gather); \
+ int pt_iommu_##fmt##_read_and_clear_dirty( \
+ struct iommu_domain *domain, unsigned long iova, size_t size, \
+ unsigned long flags, struct iommu_dirty_bitmap *dirty); \
+ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \
+ const struct pt_iommu_##fmt##_cfg *cfg, \
+ gfp_t gfp); \
+ void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \
+ struct pt_iommu_##fmt##_hw_info *info)
+#define IOMMU_FORMAT(fmt, member) \
+ struct pt_iommu_##fmt { \
+ struct pt_iommu iommu; \
+ struct pt_##fmt member; \
+ }; \
+ IOMMU_PROTOTYPES(fmt)
+
+/*
+ * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the
+ * iommu_pt
+ */
+#define IOMMU_PT_DOMAIN_OPS(fmt) \
+ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
+ .map_pages = &pt_iommu_##fmt##_map_pages, \
+ .unmap_pages = &pt_iommu_##fmt##_unmap_pages
+#define IOMMU_PT_DIRTY_OPS(fmt) \
+ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty
+
+/*
+ * The driver should setup its domain struct like
+ * union {
+ * struct iommu_domain domain;
+ * struct pt_iommu_xxx xx;
+ * };
+ * PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, xx.iommu, domain);
+ *
+ * Which creates an alias between driver_domain.domain and
+ * driver_domain.xx.iommu.domain. This is to avoid a mass rename of existing
+ * driver_domain.domain users.
+ */
+#define PT_IOMMU_CHECK_DOMAIN(s, pt_iommu_memb, domain_memb) \
+ static_assert(offsetof(s, pt_iommu_memb.domain) == \
+ offsetof(s, domain_memb))
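Putting the union aliasing and the ops macros together, a sketch of the driver glue this comment describes; the struct and ops names are hypothetical, and the amdv1 format is only an example.

```c
struct my_iommu_domain {
	union {
		struct iommu_domain domain;	/* existing field name kept */
		struct pt_iommu_amdv1 amdpt;	/* amdpt.iommu.domain aliases it */
	};
};
PT_IOMMU_CHECK_DOMAIN(struct my_iommu_domain, amdpt.iommu, domain);

static const struct iommu_domain_ops my_domain_ops = {
	IOMMU_PT_DOMAIN_OPS(amdv1),
	/* plus attach_dev etc.; IOMMU_PT_DIRTY_OPS(amdv1) for dirty tracking */
};
```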
+
+struct pt_iommu_amdv1_cfg {
+ struct pt_iommu_cfg common;
+ unsigned int starting_level;
+};
+
+struct pt_iommu_amdv1_hw_info {
+ u64 host_pt_root;
+ u8 mode;
+};
+
+IOMMU_FORMAT(amdv1, amdpt);
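Given the prototypes IOMMU_FORMAT() generates, a hedged sketch of a map call under the caller-provided VA range lock that the DOC section requires; the wrapper and its lock are hypothetical.

```c
/* Hypothetical wrapper; 'range_lock' stands in for the caller's VA range lock. */
static int my_map_one_page(struct pt_iommu_amdv1 *tbl, struct mutex *range_lock,
			   unsigned long iova, phys_addr_t paddr)
{
	size_t mapped = 0;
	int ret;

	mutex_lock(range_lock);		/* must span the whole map call */
	ret = pt_iommu_amdv1_map_pages(&tbl->iommu.domain, iova, paddr, SZ_4K,
				       1, IOMMU_READ | IOMMU_WRITE, GFP_KERNEL,
				       &mapped);
	mutex_unlock(range_lock);
	return ret;
}
```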
+
+/* amdv1_mock is used by the iommufd selftest */
+#define pt_iommu_amdv1_mock pt_iommu_amdv1
+#define pt_iommu_amdv1_mock_cfg pt_iommu_amdv1_cfg
+struct pt_iommu_amdv1_mock_hw_info;
+IOMMU_PROTOTYPES(amdv1_mock);
+
+struct pt_iommu_vtdss_cfg {
+ struct pt_iommu_cfg common;
+	/* 4 is a 57-bit 5-level table */
+ unsigned int top_level;
+};
+
+struct pt_iommu_vtdss_hw_info {
+ u64 ssptptr;
+ u8 aw;
+};
+
+IOMMU_FORMAT(vtdss, vtdss_pt);
+
+struct pt_iommu_x86_64_cfg {
+ struct pt_iommu_cfg common;
+	/* 4 is a 57-bit 5-level table */
+ unsigned int top_level;
+};
+
+struct pt_iommu_x86_64_hw_info {
+ u64 gcr3_pt;
+ u8 levels;
+};
+
+IOMMU_FORMAT(x86_64, x86_64_pt);
+
+#undef IOMMU_PROTOTYPES
+#undef IOMMU_FORMAT
+#endif
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index c4690e365ade..ca1ec437a3ca 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -99,6 +99,9 @@
#define QM_DEV_ALG_MAX_LEN 256
+#define QM_MIG_REGION_SEL 0x100198
+#define QM_MIG_REGION_EN BIT(0)
+
/* uacce mode of the driver */
#define UACCE_MODE_NOUACCE 0 /* don't use uacce */
#define UACCE_MODE_SVA 1 /* use uacce sva mode */
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 8a823c6f2b4a..7a1516011ccf 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -15,8 +15,6 @@ enum io_pgtable_fmt {
ARM_64_LPAE_S2,
ARM_V7S,
ARM_MALI_LPAE,
- AMD_IOMMU_V1,
- AMD_IOMMU_V2,
APPLE_DART,
APPLE_DART2,
IO_PGTABLE_NUM_FMTS,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c30d12e16473..801b2bd9e8d4 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -751,7 +751,8 @@ struct iommu_ops {
* @free: Release the domain after use.
*/
struct iommu_domain_ops {
- int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
+ int (*attach_dev)(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old);
int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
ioasid_t pasid, struct iommu_domain *old);
diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h
index 7494952c5518..7f3ff5c5ea53 100644
--- a/include/linux/irqchip/riscv-imsic.h
+++ b/include/linux/irqchip/riscv-imsic.h
@@ -10,7 +10,6 @@
#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/fwnode.h>
-#include <asm/csr.h>
#define IMSIC_MMIO_PAGE_SHIFT 12
#define IMSIC_MMIO_PAGE_SZ BIT(IMSIC_MMIO_PAGE_SHIFT)
@@ -86,7 +85,7 @@ static inline const struct imsic_global_config *imsic_get_global_config(void)
#endif
-#ifdef CONFIG_ACPI
+#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_RISCV_IMSIC)
int imsic_platform_acpi_probe(struct fwnode_handle *fwnode);
struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev);
#else
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 2e85504ba2ba..48f68c4dcfa5 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -115,8 +115,8 @@ struct pci_epf_driver {
* @phys_addr: physical address that should be mapped to the BAR
* @addr: virtual address corresponding to the @phys_addr
* @size: the size of the address space present in BAR
- * @aligned_size: the size actually allocated to accommodate the iATU alignment
- * requirement
+ * @mem_size: the size actually allocated to accommodate the iATU alignment
+ * requirement
* @barno: BAR number
* @flags: flags that are set for the BAR
*/
@@ -124,7 +124,7 @@ struct pci_epf_bar {
dma_addr_t phys_addr;
void *addr;
size_t size;
- size_t aligned_size;
+ size_t mem_size;
enum pci_barno barno;
int flags;
};
@@ -242,6 +242,12 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
enum pci_epc_interface_type type);
+int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size,
+ enum pci_barno bar,
+ const struct pci_epc_features *epc_features,
+ enum pci_epc_interface_type type,
+ dma_addr_t bar_addr);
+
int pci_epf_align_inbound_addr(struct pci_epf *epf, enum pci_barno bar,
u64 addr, dma_addr_t *base, size_t *off);
int pci_epf_bind(struct pci_epf *epf);
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 951f81a38f3a..517e121d2598 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -16,7 +16,58 @@
struct block_device;
struct scatterlist;
+/**
+ * struct p2pdma_provider
+ *
+ * A p2pdma provider is a range of MMIO address space available to the CPU.
+ */
+struct p2pdma_provider {
+ struct device *owner;
+ u64 bus_offset;
+};
+
+enum pci_p2pdma_map_type {
+ /*
+ * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before
+ * the mapping type has been calculated. Exported routines for the API
+ * will never return this value.
+ */
+ PCI_P2PDMA_MAP_UNKNOWN = 0,
+
+ /*
+ * Not a PCI P2PDMA transfer.
+ */
+ PCI_P2PDMA_MAP_NONE,
+
+ /*
+ * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
+ * traverse the host bridge and the host bridge is not in the
+ * allowlist. DMA Mapping routines should return an error when
+ * this is returned.
+ */
+ PCI_P2PDMA_MAP_NOT_SUPPORTED,
+
+ /*
+ * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to
+ * each other directly through a PCI switch and the transaction will
+ * not traverse the host bridge. Such a mapping should program
+ * the DMA engine with PCI bus addresses.
+ */
+ PCI_P2PDMA_MAP_BUS_ADDR,
+
+ /*
+ * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
+ * to each other, but the transaction traverses a host bridge on the
+ * allowlist. In this case, a normal mapping either with CPU physical
+ * addresses (in the case of dma-direct) or IOVA addresses (in the
+ * case of IOMMUs) should be used to program the DMA engine.
+ */
+ PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
+};
+
#ifdef CONFIG_PCI_P2PDMA
+int pcim_p2pdma_init(struct pci_dev *pdev);
+struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar);
int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
u64 offset);
int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
@@ -33,7 +84,18 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
bool *use_p2pdma);
ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
bool use_p2pdma);
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider,
+ struct device *dev);
#else /* CONFIG_PCI_P2PDMA */
+static inline int pcim_p2pdma_init(struct pci_dev *pdev)
+{
+ return -EOPNOTSUPP;
+}
+static inline struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev,
+ int bar)
+{
+ return NULL;
+}
static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
size_t size, u64 offset)
{
@@ -85,6 +147,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
{
return sprintf(page, "none\n");
}
+static inline enum pci_p2pdma_map_type
+pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
+{
+ return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+}
#endif /* CONFIG_PCI_P2PDMA */
@@ -99,51 +166,12 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client)
return pci_p2pmem_find_many(&client, 1);
}
-enum pci_p2pdma_map_type {
- /*
- * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before
- * the mapping type has been calculated. Exported routines for the API
- * will never return this value.
- */
- PCI_P2PDMA_MAP_UNKNOWN = 0,
-
- /*
- * Not a PCI P2PDMA transfer.
- */
- PCI_P2PDMA_MAP_NONE,
-
- /*
- * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
- * traverse the host bridge and the host bridge is not in the
- * allowlist. DMA Mapping routines should return an error when
- * this is returned.
- */
- PCI_P2PDMA_MAP_NOT_SUPPORTED,
-
- /*
- * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to
- * each other directly through a PCI switch and the transaction will
- * not traverse the host bridge. Such a mapping should program
- * the DMA engine with PCI bus addresses.
- */
- PCI_P2PDMA_MAP_BUS_ADDR,
-
- /*
- * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
- * to each other, but the transaction traverses a host bridge on the
- * allowlist. In this case, a normal mapping either with CPU physical
- * addresses (in the case of dma-direct) or IOVA addresses (in the
- * case of IOMMUs) should be used to program the DMA engine.
- */
- PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
-};
-
struct pci_p2pdma_map_state {
- struct dev_pagemap *pgmap;
+ struct p2pdma_provider *mem;
enum pci_p2pdma_map_type map;
- u64 bus_off;
};
+
/* helper for pci_p2pdma_state(), do not use directly */
void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
struct device *dev, struct page *page);
@@ -162,8 +190,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev,
struct page *page)
{
if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
- if (state->pgmap != page_pgmap(page))
- __pci_p2pdma_update_state(state, dev, page);
+ __pci_p2pdma_update_state(state, dev, page);
return state->map;
}
return PCI_P2PDMA_MAP_NONE;
@@ -172,16 +199,15 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev,
/**
* pci_p2pdma_bus_addr_map - Translate a physical address to a bus address
* for a PCI_P2PDMA_MAP_BUS_ADDR transfer.
- * @state: P2P state structure
+ * @provider: P2P provider structure
* @paddr: physical address to map
*
* Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer.
*/
static inline dma_addr_t
-pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr)
+pci_p2pdma_bus_addr_map(struct p2pdma_provider *provider, phys_addr_t paddr)
{
- WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR);
- return paddr + state->bus_off;
+ return paddr + provider->bus_offset;
}
#endif /* _LINUX_PCI_P2P_H */
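To show how the reworked state API above fits together, a hedged sketch of a per-page mapping step; the function is hypothetical, and the provider now lives in state->mem rather than a cached pgmap.

```c
/* Hypothetical mapping step, illustration only. */
static int my_map_one(struct device *dev, struct pci_p2pdma_map_state *state,
		      struct page *page, dma_addr_t *dma)
{
	switch (pci_p2pdma_state(state, dev, page)) {
	case PCI_P2PDMA_MAP_BUS_ADDR:
		/* Switch path: program raw PCI bus addresses. */
		*dma = pci_p2pdma_bus_addr_map(state->mem, page_to_phys(page));
		return 0;
	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
	case PCI_P2PDMA_MAP_NONE:
		/* Normal DMA API path (dma-direct or IOMMU). */
		*dma = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
		return dma_mapping_error(dev, *dma);
	default:
		return -EREMOTEIO;	/* PCI_P2PDMA_MAP_NOT_SUPPORTED */
	}
}
```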
diff --git a/include/linux/pci.h b/include/linux/pci.h
index bf97d49c23cf..b16127c6a7b4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -502,6 +502,8 @@ struct pci_dev {
#ifdef CONFIG_PCIE_PTM
u16 ptm_cap; /* PTM Capability */
unsigned int ptm_root:1;
+ unsigned int ptm_responder:1;
+ unsigned int ptm_requester:1;
unsigned int ptm_enabled:1;
u8 ptm_granularity;
#endif
@@ -648,6 +650,7 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv);
struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev,
size_t priv);
void pci_free_host_bridge(struct pci_host_bridge *bridge);
+struct device *pci_get_host_bridge_device(struct pci_dev *dev);
struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus);
void pci_set_host_bridge_release(struct pci_host_bridge *bridge,
@@ -831,6 +834,7 @@ struct pci_ops {
void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where);
int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val);
int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val);
+ int (*assert_perst)(struct pci_bus *bus, bool assert);
};
/*
@@ -1421,16 +1425,16 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev);
void pci_update_resource(struct pci_dev *dev, int resno);
int __must_check pci_assign_resource(struct pci_dev *dev, int i);
int pci_release_resource(struct pci_dev *dev, int resno);
-static inline int pci_rebar_bytes_to_size(u64 bytes)
-{
- bytes = roundup_pow_of_two(bytes);
- /* Return BAR size as defined in the resizable BAR specification */
- return max(ilog2(bytes), 20) - 20;
-}
+/* Resizable BAR related routines */
+int pci_rebar_bytes_to_size(u64 bytes);
+resource_size_t pci_rebar_size_to_bytes(int size);
+u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
+bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size);
+int pci_rebar_get_max_size(struct pci_dev *pdev, int bar);
+int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size,
+ int exclude_bars);
-u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
-int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size);
int pci_select_bars(struct pci_dev *dev, unsigned long flags);
bool pci_device_is_present(struct pci_dev *pdev);
void pci_ignore_hotplug(struct pci_dev *dev);
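For reference, the encoding the removed inline implemented, and which the new out-of-line pci_rebar_bytes_to_size() presumably preserves, is a log2 scale with a 1 MiB floor; a worked illustration, assuming that formula still holds:

```c
/*
 * size = max(ilog2(roundup_pow_of_two(bytes)), 20) - 20
 *
 *   1 MiB   -> ilog2 = 20           -> size 0 (minimum)
 *   3 MiB   -> rounds to 4 MiB (22) -> size 2
 *   256 MiB -> ilog2 = 28           -> size 8
 */
```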
@@ -1958,10 +1962,17 @@ DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T))
*/
#ifdef CONFIG_PCI_DOMAINS
extern int pci_domains_supported;
+int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max);
+void pci_bus_release_emul_domain_nr(int domain_nr);
#else
enum { pci_domains_supported = 0 };
static inline int pci_domain_nr(struct pci_bus *bus) { return 0; }
static inline int pci_proc_domain(struct pci_bus *bus) { return 0; }
+static inline int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max)
+{
+ return 0;
+}
+static inline void pci_bus_release_emul_domain_nr(int domain_nr) { }
#endif /* CONFIG_PCI_DOMAINS */
/*
diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h
index 6653abfdf747..b3950ce0625e 100644
--- a/include/linux/power/max77705_charger.h
+++ b/include/linux/power/max77705_charger.h
@@ -123,6 +123,8 @@
#define MAX77705_DISABLE_SKIP 1
#define MAX77705_AUTO_SKIP 0
+#define AICL_WORK_DELAY_MS 100
+
/* uA */
#define MAX77705_CURRENT_CHGIN_STEP 25000
#define MAX77705_CURRENT_CHG_STEP 50000
diff --git a/include/linux/ras.h b/include/linux/ras.h
index a64182bc72ad..468941bfe855 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -24,8 +24,7 @@ int __init parse_cec_param(char *str);
void log_non_standard_event(const guid_t *sec_type,
const guid_t *fru_id, const char *fru_text,
const u8 sev, const u8 *err, const u32 len);
-void log_arm_hw_error(struct cper_sec_proc_arm *err);
-
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev);
#else
static inline void
log_non_standard_event(const guid_t *sec_type,
@@ -33,7 +32,7 @@ log_non_standard_event(const guid_t *sec_type,
const u8 sev, const u8 *err, const u32 len)
{ return; }
static inline void
-log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; }
#endif
struct atl_err {
@@ -53,4 +52,15 @@ static inline unsigned long
amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
#endif /* CONFIG_AMD_ATL */
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+#include <asm/smp_plat.h>
+/*
+ * The ARM-specific SMP header above provides get_logical_index(), which maps
+ * an MPIDR to a CPU logical index.
+ */
+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & MPIDR_HWID_BITMASK)
+#else
+#define GET_LOGICAL_INDEX(mpidr) -EINVAL
+#endif /* CONFIG_ARM || CONFIG_ARM64 */
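A hedged sketch of how a consumer such as the GHES handler might use the macro; the wrapper is hypothetical, and cper_sec_proc_arm provides the mpidr field.

```c
/* Hypothetical caller; evaluates to -EINVAL on non-ARM builds per the stub. */
static int arm_err_cpu(const struct cper_sec_proc_arm *proc)
{
	return GET_LOGICAL_INDEX(proc->mpidr);
}
```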
+
#endif /* __RAS_H__ */
diff --git a/include/linux/sizes.h b/include/linux/sizes.h
index 49039494076f..f1f1a055b047 100644
--- a/include/linux/sizes.h
+++ b/include/linux/sizes.h
@@ -67,5 +67,6 @@
#define SZ_16T _AC(0x100000000000, ULL)
#define SZ_32T _AC(0x200000000000, ULL)
#define SZ_64T _AC(0x400000000000, ULL)
+#define SZ_128T _AC(0x800000000000, ULL)
#endif /* __LINUX_SIZES_H__ */
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index eb563f538dee..e90859956514 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -21,6 +21,7 @@ struct kvm;
struct iommufd_ctx;
struct iommufd_device;
struct iommufd_access;
+struct vfio_info_cap;
/*
* VFIO devices can be placed in a set, this allows all devices to share this
@@ -132,6 +133,9 @@ struct vfio_device_ops {
size_t count, loff_t *size);
long (*ioctl)(struct vfio_device *vdev, unsigned int cmd,
unsigned long arg);
+ int (*get_region_info_caps)(struct vfio_device *vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
void (*request)(struct vfio_device *vdev, unsigned int count);
int (*match)(struct vfio_device *vdev, char *buf);
@@ -297,6 +301,8 @@ static inline void vfio_put_device(struct vfio_device *device)
int vfio_register_group_dev(struct vfio_device *device);
int vfio_register_emulated_iommu_dev(struct vfio_device *device);
void vfio_unregister_group_dev(struct vfio_device *device);
+bool vfio_device_try_get_registration(struct vfio_device *device);
+void vfio_device_put_registration(struct vfio_device *device);
int vfio_assign_device_set(struct vfio_device *device, void *set_id);
unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set);
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f541044e42a2..706877f998ff 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <linux/vfio.h>
#include <linux/irqbypass.h>
+#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/notifier.h>
@@ -26,6 +27,14 @@
struct vfio_pci_core_device;
struct vfio_pci_region;
+struct p2pdma_provider;
+struct dma_buf_phys_vec;
+struct dma_buf_attachment;
+
+struct vfio_pci_eventfd {
+ struct eventfd_ctx *ctx;
+ struct rcu_head rcu;
+};
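A minimal sketch of the RCU pattern this wrapper enables for the err_trigger/req_trigger fields below; the function is hypothetical and assumes the single-argument eventfd_signal().

```c
/* Hypothetical signalling path; readers never block trigger replacement. */
static void my_signal_err(struct vfio_pci_core_device *vdev)
{
	struct vfio_pci_eventfd *efd;

	rcu_read_lock();
	efd = rcu_dereference(vdev->err_trigger);
	if (efd)
		eventfd_signal(efd->ctx);
	rcu_read_unlock();
}
```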
struct vfio_pci_regops {
ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
@@ -49,9 +58,48 @@ struct vfio_pci_region {
u32 flags;
};
+struct vfio_pci_device_ops {
+ int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev,
+ struct p2pdma_provider **provider,
+ unsigned int region_index,
+ struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges);
+};
+
+#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)
+int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges, phys_addr_t start,
+ phys_addr_t len);
+int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
+ struct p2pdma_provider **provider,
+ unsigned int region_index,
+ struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges);
+#else
+static inline int
+vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges, phys_addr_t start,
+ phys_addr_t len)
+{
+ return -EINVAL;
+}
+static inline int vfio_pci_core_get_dmabuf_phys(
+ struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider,
+ unsigned int region_index, struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges, size_t nr_ranges)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
struct vfio_pci_core_device {
struct vfio_device vdev;
struct pci_dev *pdev;
+ const struct vfio_pci_device_ops *pci_ops;
void __iomem *barmap[PCI_STD_NUM_BARS];
bool bar_mmap_supported[PCI_STD_NUM_BARS];
u8 *pci_config_map;
@@ -83,8 +131,8 @@ struct vfio_pci_core_device {
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
- struct eventfd_ctx *err_trigger;
- struct eventfd_ctx *req_trigger;
+ struct vfio_pci_eventfd __rcu *err_trigger;
+ struct vfio_pci_eventfd __rcu *req_trigger;
struct eventfd_ctx *pm_wake_eventfd_ctx;
struct list_head dummy_resources_list;
struct mutex ioeventfds_lock;
@@ -94,6 +142,7 @@ struct vfio_pci_core_device {
struct vfio_pci_core_device *sriov_pf_core_dev;
struct notifier_block nb;
struct rw_semaphore memory_lock;
+ struct list_head dmabufs;
};
/* Will be exported for vfio pci drivers usage */
@@ -115,10 +164,16 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg);
int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
void __user *arg, size_t argsz);
+int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
size_t count, loff_t *ppos);
ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
size_t count, loff_t *ppos);
+vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev,
+ struct vm_fault *vmf, unsigned long pfn,
+ unsigned int order);
int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma);
void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count);
int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
@@ -134,6 +189,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
void __iomem *io, char __user *buf,
loff_t off, size_t count, size_t x_start,
size_t x_end, bool iswrite);
+bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
loff_t reg_start, size_t reg_cnt,
loff_t *buf_offset,
@@ -161,4 +217,17 @@ VFIO_IOREAD_DECLARATION(32)
VFIO_IOREAD_DECLARATION(64)
#endif
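+/*
+ * True when a mapping of 2^order pages at @addr/@pfn fits entirely inside
+ * the VMA and @pfn is naturally aligned to the order; order 0 always passes.
+ */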
+static inline bool is_aligned_for_order(struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long pfn,
+ unsigned int order)
+{
+ return !(order && (addr < vma->vm_start ||
+ addr + (PAGE_SIZE << order) > vma->vm_end ||
+ !IS_ALIGNED(pfn, 1 << order)));
+}
+
+int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys);
+
#endif /* VFIO_PCI_CORE_H */
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 96c66126c074..132a474e5914 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -177,7 +177,7 @@ struct virtio_device {
union virtio_map vmap;
#ifdef CONFIG_VIRTIO_DEBUG
struct dentry *debugfs_dir;
- u64 debugfs_filter_features[VIRTIO_FEATURES_DWORDS];
+ u64 debugfs_filter_features[VIRTIO_FEATURES_U64S];
#endif
};
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 16001e9f9b39..69f84ea85d71 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -24,7 +24,7 @@ typedef void vq_callback_t(struct virtqueue *);
* a virtqueue unused by the driver.
* @callback: A callback to invoke on a used buffer notification.
* NULL for a virtqueue that does not need a callback.
- * @ctx: A flag to indicate to maintain an extra context per virtqueue.
+ * @ctx: whether to maintain an extra context per virtqueue.
*/
struct virtqueue_info {
const char *name;
@@ -80,13 +80,13 @@ struct virtqueue_info {
* Returns the first 64 feature bits.
* @get_extended_features:
* vdev: the virtio_device
- * Returns the first VIRTIO_FEATURES_MAX feature bits (all we currently
+ * Returns the first VIRTIO_FEATURES_BITS feature bits (all we currently
* need).
* @finalize_features: confirm what device features we'll be using.
* vdev: the virtio_device
* This sends the driver feature bits to the device: it can change
* the dev->feature bits if it wants.
- * Note that despite the name this can be called any number of
+ * Note that despite the name this can be called any number of
* times.
* Returns 0 on success or error status
* @bus_name: return the bus name associated with the device (optional)
@@ -141,8 +141,8 @@ struct virtio_config_ops {
/**
* struct virtio_map_ops - operations for mapping buffer for a virtio device
- * Note: For transport that has its own mapping logic it must
- * implements all of the operations
+ * Note: For a transport that has its own mapping logic, it must
+ * implement all of the operations.
* @map_page: map a buffer to the device
* map: metadata for performing mapping
* page: the page that will be mapped by the device
@@ -150,7 +150,7 @@ struct virtio_config_ops {
* size: the buffer size
* dir: mapping direction
* attrs: mapping attributes
- * Returns: the mapped address
+ * Returns the mapped address
* @unmap_page: unmap a buffer from the device
* map: device specific mapping map
* map_handle: the mapped address
@@ -172,23 +172,23 @@ struct virtio_config_ops {
* size: the size of the buffer
* map_handle: the mapping address to sync
* gfp: allocation flag (GFP_XXX)
- * Returns: virtual address of the allocated buffer
+ * Returns virtual address of the allocated buffer
* @free: free a coherent buffer mapping
* map: metadata for performing mapping
* size: the size of the buffer
* vaddr: virtual address of the buffer
- * map_handle: the mapping address to sync
+ * map_handle: the mapping address that needs to be freed
* attrs: unmapping attributes
* @need_sync: if the buffer needs synchronization
* map: metadata for performing mapping
* map_handle: the mapped address
- * Returns: whether the buffer needs synchronization
+ * Returns whether the buffer needs synchronization
* @mapping_error: if the mapping address is error
* map: metadata for performing mapping
* map_handle: the mapped address
* @max_mapping_size: get the maximum buffer size that can be mapped
* map: metadata for performing mapping
- * Returns: the maximum buffer size that can be mapped
+ * Returns the maximum buffer size that can be mapped
*/
struct virtio_map_ops {
dma_addr_t (*map_page)(union virtio_map map, struct page *page,
@@ -362,7 +362,7 @@ void virtio_device_ready(struct virtio_device *dev)
* specific set_status() method.
*
* A well behaved device will only notify a virtqueue after
- * DRIVER_OK, this means the device should "see" the coherenct
+ * DRIVER_OK, this means the device should "see" the coherent
* memory write that set vq->broken as false which is done by
* the driver when it sees DRIVER_OK, then the following
* driver's vring_interrupt() will see vq->broken as false so
@@ -384,7 +384,7 @@ const char *virtio_bus_name(struct virtio_device *vdev)
* @vq: the virtqueue
* @cpu_mask: the cpu mask
*
- * Pay attention the function are best-effort: the affinity hint may not be set
+ * Note that this function is best-effort: the affinity hint may not be set
* due to config support, irq type and sharing.
*
*/
diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h
index f748f2f87de8..ea2ad8717882 100644
--- a/include/linux/virtio_features.h
+++ b/include/linux/virtio_features.h
@@ -4,15 +4,16 @@
#include <linux/bits.h>
-#define VIRTIO_FEATURES_DWORDS 2
-#define VIRTIO_FEATURES_MAX (VIRTIO_FEATURES_DWORDS * 64)
-#define VIRTIO_FEATURES_WORDS (VIRTIO_FEATURES_DWORDS * 2)
+#define VIRTIO_FEATURES_U64S 2
+#define VIRTIO_FEATURES_BITS (VIRTIO_FEATURES_U64S * 64)
+
#define VIRTIO_BIT(b) BIT_ULL((b) & 0x3f)
-#define VIRTIO_DWORD(b) ((b) >> 6)
+#define VIRTIO_U64(b) ((b) >> 6)
+
#define VIRTIO_DECLARE_FEATURES(name) \
union { \
u64 name; \
- u64 name##_array[VIRTIO_FEATURES_DWORDS];\
+ u64 name##_array[VIRTIO_FEATURES_U64S];\
}
static inline bool virtio_features_chk_bit(unsigned int bit)
@@ -22,9 +23,9 @@ static inline bool virtio_features_chk_bit(unsigned int bit)
 	 * Don't care about returning the correct value: the build
 	 * will fail before any bad feature access
*/
- BUILD_BUG_ON(bit >= VIRTIO_FEATURES_MAX);
+ BUILD_BUG_ON(bit >= VIRTIO_FEATURES_BITS);
} else {
- if (WARN_ON_ONCE(bit >= VIRTIO_FEATURES_MAX))
+ if (WARN_ON_ONCE(bit >= VIRTIO_FEATURES_BITS))
return false;
}
return true;
@@ -34,26 +35,26 @@ static inline bool virtio_features_test_bit(const u64 *features,
unsigned int bit)
{
return virtio_features_chk_bit(bit) &&
- !!(features[VIRTIO_DWORD(bit)] & VIRTIO_BIT(bit));
+ !!(features[VIRTIO_U64(bit)] & VIRTIO_BIT(bit));
}
static inline void virtio_features_set_bit(u64 *features,
unsigned int bit)
{
if (virtio_features_chk_bit(bit))
- features[VIRTIO_DWORD(bit)] |= VIRTIO_BIT(bit);
+ features[VIRTIO_U64(bit)] |= VIRTIO_BIT(bit);
}
static inline void virtio_features_clear_bit(u64 *features,
unsigned int bit)
{
if (virtio_features_chk_bit(bit))
- features[VIRTIO_DWORD(bit)] &= ~VIRTIO_BIT(bit);
+ features[VIRTIO_U64(bit)] &= ~VIRTIO_BIT(bit);
}
static inline void virtio_features_zero(u64 *features)
{
- memset(features, 0, sizeof(features[0]) * VIRTIO_FEATURES_DWORDS);
+ memset(features, 0, sizeof(features[0]) * VIRTIO_FEATURES_U64S);
}
static inline void virtio_features_from_u64(u64 *features, u64 from)
@@ -66,7 +67,7 @@ static inline bool virtio_features_equal(const u64 *f1, const u64 *f2)
{
int i;
- for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i)
+ for (i = 0; i < VIRTIO_FEATURES_U64S; ++i)
if (f1[i] != f2[i])
return false;
return true;
@@ -74,14 +75,14 @@ static inline bool virtio_features_equal(const u64 *f1, const u64 *f2)
static inline void virtio_features_copy(u64 *to, const u64 *from)
{
- memcpy(to, from, sizeof(to[0]) * VIRTIO_FEATURES_DWORDS);
+ memcpy(to, from, sizeof(to[0]) * VIRTIO_FEATURES_U64S);
}
static inline void virtio_features_andnot(u64 *to, const u64 *f1, const u64 *f2)
{
int i;
- for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++)
+ for (i = 0; i < VIRTIO_FEATURES_U64S; i++)
to[i] = f1[i] & ~f2[i];
}
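A hedged illustration of the renamed helpers: bit 70 lands in the second u64, i.e. feats[VIRTIO_U64(70)], which is feats[1]. The example function is hypothetical.

```c
static void virtio_features_example(void)
{
	u64 feats[VIRTIO_FEATURES_U64S];

	virtio_features_zero(feats);
	virtio_features_set_bit(feats, 3);	/* feats[0] |= BIT_ULL(3) */
	virtio_features_set_bit(feats, 70);	/* feats[1] |= BIT_ULL(6) */

	WARN_ON(!virtio_features_test_bit(feats, 70));
}
```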
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 48bc12d1045b..9a3f2fc53bd6 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -107,7 +107,7 @@ void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev,
static inline u64
vp_modern_get_features(struct virtio_pci_modern_device *mdev)
{
- u64 features_array[VIRTIO_FEATURES_DWORDS];
+ u64 features_array[VIRTIO_FEATURES_U64S];
vp_modern_get_extended_features(mdev, features_array);
return features_array[0];
@@ -116,11 +116,11 @@ vp_modern_get_features(struct virtio_pci_modern_device *mdev)
static inline u64
vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev)
{
- u64 features_array[VIRTIO_FEATURES_DWORDS];
+ u64 features_array[VIRTIO_FEATURES_U64S];
int i;
vp_modern_get_driver_extended_features(mdev, features_array);
- for (i = 1; i < VIRTIO_FEATURES_DWORDS; ++i)
+ for (i = 1; i < VIRTIO_FEATURES_U64S; ++i)
WARN_ON_ONCE(features_array[i]);
return features_array[0];
}
@@ -128,7 +128,7 @@ vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev)
static inline void
vp_modern_set_features(struct virtio_pci_modern_device *mdev, u64 features)
{
- u64 features_array[VIRTIO_FEATURES_DWORDS];
+ u64 features_array[VIRTIO_FEATURES_U64S];
virtio_features_from_u64(features_array, features);
vp_modern_set_extended_features(mdev, features_array);
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index c8cd0f00c845..c9f0b1018bcc 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -168,11 +168,25 @@ TRACE_EVENT(mc_event,
* This event is generated when hardware detects an ARM processor error
* has occurred. UEFI 2.6 spec section N.2.4.4.
*/
+#define APEIL "ARM Processor Err Info data len"
+#define APEID "ARM Processor Err Info raw data"
+#define APECIL "ARM Processor Err Context Info data len"
+#define APECID "ARM Processor Err Context Info raw data"
+#define VSEIL "Vendor Specific Err Info data len"
+#define VSEID "Vendor Specific Err Info raw data"
TRACE_EVENT(arm_event,
- TP_PROTO(const struct cper_sec_proc_arm *proc),
+ TP_PROTO(const struct cper_sec_proc_arm *proc,
+ const u8 *pei_err,
+ const u32 pei_len,
+ const u8 *ctx_err,
+ const u32 ctx_len,
+ const u8 *oem,
+ const u32 oem_len,
+ u8 sev,
+ int cpu),
- TP_ARGS(proc),
+ TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu),
TP_STRUCT__entry(
__field(u64, mpidr)
@@ -180,6 +194,14 @@ TRACE_EVENT(arm_event,
__field(u32, running_state)
__field(u32, psci_state)
__field(u8, affinity)
+ __field(u32, pei_len)
+ __dynamic_array(u8, pei_buf, pei_len)
+ __field(u32, ctx_len)
+ __dynamic_array(u8, ctx_buf, ctx_len)
+ __field(u32, oem_len)
+ __dynamic_array(u8, oem_buf, oem_len)
+ __field(u8, sev)
+ __field(int, cpu)
),
TP_fast_assign(
@@ -199,12 +221,29 @@ TRACE_EVENT(arm_event,
__entry->running_state = ~0;
__entry->psci_state = ~0;
}
+ __entry->pei_len = pei_len;
+ memcpy(__get_dynamic_array(pei_buf), pei_err, pei_len);
+ __entry->ctx_len = ctx_len;
+ memcpy(__get_dynamic_array(ctx_buf), ctx_err, ctx_len);
+ __entry->oem_len = oem_len;
+ memcpy(__get_dynamic_array(oem_buf), oem, oem_len);
+ __entry->sev = sev;
+ __entry->cpu = cpu;
),
- TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
- "running state: %d; PSCI state: %d",
+ TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
+ "running state: %d; PSCI state: %d; "
+ "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
+ __entry->cpu,
+ __entry->sev,
__entry->affinity, __entry->mpidr, __entry->midr,
- __entry->running_state, __entry->psci_state)
+ __entry->running_state, __entry->psci_state,
+ APEIL, __entry->pei_len, APEID,
+ __print_hex(__get_dynamic_array(pei_buf), __entry->pei_len),
+ APECIL, __entry->ctx_len, APECID,
+ __print_hex(__get_dynamic_array(ctx_buf), __entry->ctx_len),
+ VSEIL, __entry->oem_len, VSEID,
+ __print_hex(__get_dynamic_array(oem_buf), __entry->oem_len))
);
/*
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 1fa3786f82f4..4808a355de41 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -271,7 +271,7 @@ struct ib_cm_event {
#define CM_APR_ATTR_ID cpu_to_be16(0x001A)
/**
- * ib_cm_handler - User-defined callback to process communication events.
+ * typedef ib_cm_handler - User-defined callback to process communication events.
* @cm_id: Communication identifier associated with the reported event.
* @event: Information about the communication event.
*
@@ -482,7 +482,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
/**
* ib_prepare_cm_mra - Prepares to send a message receipt acknowledgment to a
- connection message in case duplicates are received.
+ * connection message in case duplicates are received.
* @cm_id: Connection identifier associated with the connection message.
*/
int ib_prepare_cm_mra(struct ib_cm_id *cm_id);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6139223e92e4..6aad66bc5dd7 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -586,10 +586,10 @@ enum ib_stat_flag {
};
/**
- * struct rdma_stat_desc
- * @name - The name of the counter
- * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL
- * @priv - Driver private information; Core code should not use
+ * struct rdma_stat_desc - description of one rdma stat/counter
+ * @name: The name of the counter
+ * @flags: Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL
+ * @priv: Driver private information; Core code should not use
*/
struct rdma_stat_desc {
const char *name;
@@ -598,24 +598,24 @@ struct rdma_stat_desc {
};
/**
- * struct rdma_hw_stats
- * @lock - Mutex to protect parallel write access to lifespan and values
+ * struct rdma_hw_stats - collection of hardware stats and their management
+ * @lock: Mutex to protect parallel write access to lifespan and values
 *	of counters, which are 64 bits and not guaranteed to be written
 *	atomically on 32-bit systems.
- * @timestamp - Used by the core code to track when the last update was
- * @lifespan - Used by the core code to determine how old the counters
+ * @timestamp: Used by the core code to track when the last update was
+ * @lifespan: Used by the core code to determine how old the counters
* should be before being updated again. Stored in jiffies, defaults
 *	to 10 milliseconds, drivers can override the default by specifying
* their own value during their allocation routine.
- * @descs - Array of pointers to static descriptors used for the counters
+ * @descs: Array of pointers to static descriptors used for the counters
* in directory.
- * @is_disabled - A bitmap to indicate each counter is currently disabled
+ * @is_disabled: A bitmap to indicate each counter is currently disabled
* or not.
- * @num_counters - How many hardware counters there are. If name is
+ * @num_counters: How many hardware counters there are. If name is
* shorter than this number, a kernel oops will result. Driver authors
* are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters)
* in their code to prevent this.
- * @value - Array of u64 counters that are accessed by the sysfs code and
+ * @value: Array of u64 counters that are accessed by the sysfs code and
* filled in by the drivers get_stats routine
*/
struct rdma_hw_stats {
@@ -859,6 +859,7 @@ enum ib_rate {
IB_RATE_400_GBPS = 21,
IB_RATE_600_GBPS = 22,
IB_RATE_800_GBPS = 23,
+ IB_RATE_1600_GBPS = 25,
};
/**
@@ -2405,7 +2406,7 @@ struct ib_device_ops {
int (*modify_port)(struct ib_device *device, u32 port_num,
int port_modify_mask,
struct ib_port_modify *port_modify);
- /**
+ /*
* The following mandatory functions are used only at device
* registration. Keep functions such as these at the end of this
* structure to avoid cache line misses when accessing struct ib_device
@@ -2415,7 +2416,7 @@ struct ib_device_ops {
struct ib_port_immutable *immutable);
enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
u32 port_num);
- /**
+ /*
* When calling get_netdev, the HW vendor's driver should return the
* net device of device @device at port @port_num or NULL if such
* a net device doesn't exist. The vendor driver should call dev_hold
@@ -2425,7 +2426,7 @@ struct ib_device_ops {
*/
struct net_device *(*get_netdev)(struct ib_device *device,
u32 port_num);
- /**
+ /*
* rdma netdev operation
*
* Driver implementing alloc_rdma_netdev or rdma_netdev_get_params
@@ -2439,14 +2440,14 @@ struct ib_device_ops {
int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num,
enum rdma_netdev_t type,
struct rdma_netdev_alloc_params *params);
- /**
+ /*
 	 * query_gid should return the GID value for @device, when @port_num
 	 * link layer is either IB or iWarp. It is a no-op if the @port_num port
* is RoCE link layer.
*/
int (*query_gid)(struct ib_device *device, u32 port_num, int index,
union ib_gid *gid);
- /**
+ /*
* When calling add_gid, the HW vendor's driver should add the gid
* of device of port at gid index available at @attr. Meta-info of
* that gid (for example, the network device related to this gid) is
@@ -2460,7 +2461,7 @@ struct ib_device_ops {
* roce_gid_table is used.
*/
int (*add_gid)(const struct ib_gid_attr *attr, void **context);
- /**
+ /*
* When calling del_gid, the HW vendor's driver should delete the
* gid of device @device at gid index gid_index of port port_num
* available in @attr.
@@ -2475,7 +2476,7 @@ struct ib_device_ops {
struct ib_udata *udata);
void (*dealloc_ucontext)(struct ib_ucontext *context);
int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
- /**
+ /*
* This will be called once refcount of an entry in mmap_xa reaches
* zero. The type of the memory that was mapped may differ between
* entries and is opaque to the rdma_user_mmap interface.
@@ -2516,12 +2517,12 @@ struct ib_device_ops {
int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
- /**
+ /*
* pre_destroy_cq - Prevent a cq from generating any new work
* completions, but not free any kernel resources
*/
int (*pre_destroy_cq)(struct ib_cq *cq);
- /**
+ /*
* post_destroy_cq - Free all kernel resources
*/
void (*post_destroy_cq)(struct ib_cq *cq);
@@ -2615,7 +2616,7 @@ struct ib_device_ops {
struct scatterlist *meta_sg, int meta_sg_nents,
unsigned int *meta_sg_offset);
- /**
+ /*
* alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and
* fill in the driver initialized data. The struct is kfree()'ed by
* the sysfs core when the device is removed. A lifespan of -1 in the
@@ -2624,7 +2625,7 @@ struct ib_device_ops {
struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device);
struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device,
u32 port_num);
- /**
+ /*
* get_hw_stats - Fill in the counter value(s) in the stats struct.
* @index - The index in the value array we wish to have updated, or
* num_counters if we want all stats updated
@@ -2639,14 +2640,14 @@ struct ib_device_ops {
int (*get_hw_stats)(struct ib_device *device,
struct rdma_hw_stats *stats, u32 port, int index);
- /**
+ /*
* modify_hw_stat - Modify the counter configuration
* @enable: true/false when enable/disable a counter
* Return codes - 0 on success or error code otherwise.
*/
int (*modify_hw_stat)(struct ib_device *device, u32 port,
unsigned int counter_index, bool enable);
- /**
+ /*
* Allows rdma drivers to add their own restrack attributes.
*/
int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr);
@@ -2682,39 +2683,39 @@ struct ib_device_ops {
u8 pdata_len);
int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog);
int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
- /**
+ /*
* counter_bind_qp - Bind a QP to a counter.
* @counter - The counter to be bound. If counter->id is zero then
* the driver needs to allocate a new counter and set counter->id
*/
int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp,
u32 port);
- /**
+ /*
* counter_unbind_qp - Unbind the qp from the dynamically-allocated
* counter and bind it onto the default one
*/
int (*counter_unbind_qp)(struct ib_qp *qp, u32 port);
- /**
+ /*
 	 * counter_dealloc - De-allocate the hw counter
*/
int (*counter_dealloc)(struct rdma_counter *counter);
- /**
+ /*
* counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in
* the driver initialized data.
*/
struct rdma_hw_stats *(*counter_alloc_stats)(
struct rdma_counter *counter);
- /**
+ /*
* counter_update_stats - Query the stats value of this counter
*/
int (*counter_update_stats)(struct rdma_counter *counter);
- /**
+ /*
* counter_init - Initialize the driver specific rdma counter struct.
*/
void (*counter_init)(struct rdma_counter *counter);
- /**
+ /*
* Allows rdma drivers to add their own restrack attributes
* dumped via 'rdma stat' iproute2 command.
*/
@@ -2730,25 +2731,25 @@ struct ib_device_ops {
*/
int (*get_numa_node)(struct ib_device *dev);
- /**
+ /*
* add_sub_dev - Add a sub IB device
*/
struct ib_device *(*add_sub_dev)(struct ib_device *parent,
enum rdma_nl_dev_type type,
const char *name);
- /**
+ /*
* del_sub_dev - Delete a sub IB device
*/
void (*del_sub_dev)(struct ib_device *sub_dev);
- /**
+ /*
 	 * ufile_cleanup - Attempt to clean up uobjects' HW resources inside
* the ufile.
*/
void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile);
- /**
+ /*
* report_port_event - Drivers need to implement this if they have
* some private stuff to handle when link status changes.
*/
@@ -3157,8 +3158,8 @@ static inline u32 rdma_start_port(const struct ib_device *device)
/**
* rdma_for_each_port - Iterate over all valid port numbers of the IB device
- * @device - The struct ib_device * to iterate over
- * @iter - The unsigned int to store the port number
+ * @device: The struct ib_device * to iterate over
+ * @iter: The unsigned int to store the port number
*/
#define rdma_for_each_port(device, iter) \
for (iter = rdma_start_port(device + \
@@ -3524,7 +3525,7 @@ static inline bool rdma_core_cap_opa_port(struct ib_device *device,
/**
* rdma_mtu_enum_to_int - Return the mtu of the port as an integer value.
* @device: Device
- * @port_num: Port number
+ * @port: Port number
* @mtu: enum value of MTU
*
* Return the MTU size supported by the port as an integer value. Will return
@@ -3542,7 +3543,7 @@ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port,
/**
* rdma_mtu_from_attr - Return the mtu of the port from the port attribute.
* @device: Device
- * @port_num: Port number
+ * @port: Port number
* @attr: port attribute
*
* Return the MTU size supported by the port as an integer value.
@@ -3919,7 +3920,7 @@ static inline int ib_destroy_qp(struct ib_qp *qp)
/**
* ib_open_qp - Obtain a reference to an existing sharable QP.
- * @xrcd - XRC domain
+ * @xrcd: XRC domain
* @qp_open_attr: Attributes identifying the QP to open.
*
* Returns a reference to a sharable QP.
@@ -4273,9 +4274,9 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
/**
* ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses
* @dev: The device for which the DMA addresses are to be created
- * @sg: The sg_table object describing the buffer
+ * @sgt: The sg_table object describing the buffer
* @direction: The direction of the DMA
- * @attrs: Optional DMA attributes for the map operation
+ * @dma_attrs: Optional DMA attributes for the map operation
*/
static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev,
struct sg_table *sgt,
@@ -4419,8 +4420,8 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
/**
* ib_update_fast_reg_key - updates the key portion of the fast_reg MR
* R_Key and L_Key.
- * @mr - struct ib_mr pointer to be updated.
- * @newkey - new key to be used.
+ * @mr: struct ib_mr pointer to be updated.
+ * @newkey: new key to be used.
*/
static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
{
@@ -4431,7 +4432,7 @@ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
/**
* ib_inc_rkey - increments the key portion of the given rkey. Can be used
* for calculating a new rkey for type 2 memory windows.
- * @rkey - the rkey to increment.
+ * @rkey: the rkey to increment.
*/
static inline u32 ib_inc_rkey(u32 rkey)
{
@@ -4525,7 +4526,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
/**
* ib_device_try_get: Hold a registration lock
- * device: The device to lock
+ * @dev: The device to lock
*
* A device under an active registration lock cannot become unregistered. It
* is only possible to obtain a registration lock on a device that is fully
@@ -4832,7 +4833,7 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector)
* rdma_roce_rescan_device - Rescan all of the network devices in the system
* and add their gids, as needed, to the relevant RoCE devices.
*
- * @device: the rdma device
+ * @ibdev: the rdma device
*/
void rdma_roce_rescan_device(struct ib_device *ibdev);
void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port);
@@ -4885,7 +4886,7 @@ static inline struct ib_device *rdma_device_to_ibdev(struct device *device)
/**
* ibdev_to_node - return the NUMA node for a given ib_device
- * @dev: device to get the NUMA node for.
+ * @ibdev: device to get the NUMA node for.
*/
static inline int ibdev_to_node(struct ib_device *ibdev)
{
@@ -4923,6 +4924,7 @@ static inline struct net *rdma_dev_net(struct ib_device *device)
/**
* rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based
* on the flow_label
+ * @fl: flow_label value
*
* This function will convert the 20 bit flow_label input to a valid RoCE v2
* UDP src port 14 bit value. All RoCE V2 drivers should use this same
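The change pattern in this header is uniform: comments that were never valid kernel-doc are demoted from /** to /*, and real kernel-doc entries switch their parameter lines from the unrecognized "@name -" form to "@name:". For reference, a minimal well-formed block, using a hypothetical function:

/**
 * rdma_example_query - one-line summary of the helper
 * @device: the ib_device being queried (colon, not dash)
 * @port: 1-based port number
 *
 * Free-form description follows after a blank comment line.
 *
 * Return: 0 on success, negative errno on failure.
 */
int rdma_example_query(struct ib_device *device, u32 port);

Anything else under /** that does not follow this shape produces warnings from scripts/kernel-doc, which is why the ops comments above are demoted rather than reworked.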
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index d67892944193..71140ea0aeb2 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -144,7 +144,7 @@
#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1)
/**
- * rvt_ud_wr - IB UD work plus AH cache
+ * struct rvt_ud_wr - IB UD work plus AH cache
* @wr: valid IB work request
* @attr: pointer to an allocated AH attribute
*
@@ -184,10 +184,10 @@ struct rvt_swqe {
* struct rvt_krwq - kernel struct receive work request
* @p_lock: lock to protect producer of the kernel buffer
* @head: index of next entry to fill
- * @c_lock:lock to protect consumer of the kernel buffer
+ * @c_lock: lock to protect consumer of the kernel buffer
* @tail: index of next entry to pull
- * @count: count is aproximate of total receive enteries posted
- * @rvt_rwqe: struct of receive work request queue entry
+ * @count: approximate count of total receive entries posted
+ * @curr_wq: struct of receive work request queue entry
*
* This structure is used to contain the head pointer,
* tail pointer and receive work queue entries for kernel
@@ -309,10 +309,10 @@ struct rvt_ack_entry {
#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)
/**
- * rvt_operation_params - op table entry
- * @length - the length to copy into the swqe entry
- * @qpt_support - a bit mask indicating QP type support
- * @flags - RVT_OPERATION flags (see above)
+ * struct rvt_operation_params - op table entry
+ * @length: the length to copy into the swqe entry
+ * @qpt_support: a bit mask indicating QP type support
+ * @flags: RVT_OPERATION flags (see above)
*
* This supports table driven post send so that
the driver can have differing and potentially
@@ -552,7 +552,7 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n)
/**
* rvt_is_user_qp - return if this is user mode QP
- * @qp - the target QP
+ * @qp: the target QP
*/
static inline bool rvt_is_user_qp(struct rvt_qp *qp)
{
@@ -561,7 +561,7 @@ static inline bool rvt_is_user_qp(struct rvt_qp *qp)
/**
* rvt_get_qp - get a QP reference
- * @qp - the QP to hold
+ * @qp: the QP to hold
*/
static inline void rvt_get_qp(struct rvt_qp *qp)
{
@@ -570,7 +570,7 @@ static inline void rvt_get_qp(struct rvt_qp *qp)
/**
* rvt_put_qp - release a QP reference
- * @qp - the QP to release
+ * @qp: the QP to release
*/
static inline void rvt_put_qp(struct rvt_qp *qp)
{
@@ -580,7 +580,7 @@ static inline void rvt_put_qp(struct rvt_qp *qp)
/**
* rvt_put_swqe - drop mr refs held by swqe
- * @wqe - the send wqe
+ * @wqe: the send wqe
*
* This drops any mr references held by the swqe
*/
@@ -597,8 +597,8 @@ static inline void rvt_put_swqe(struct rvt_swqe *wqe)
/**
* rvt_qp_wqe_reserve - reserve operation
- * @qp - the rvt qp
- * @wqe - the send wqe
+ * @qp: the rvt qp
+ * @wqe: the send wqe
*
* This routine is used in post send to record
* a wqe relative reserved operation use.
@@ -612,8 +612,8 @@ static inline void rvt_qp_wqe_reserve(
/**
* rvt_qp_wqe_unreserve - clean reserved operation
- * @qp - the rvt qp
- * @flags - send wqe flags
+ * @qp: the rvt qp
+ * @flags: send wqe flags
*
* This decrements the reserve use count.
*
@@ -653,8 +653,8 @@ u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len);
/**
* rvt_div_round_up_mtu - round up divide
- * @qp - the qp pair
- * @len - the length
+ * @qp: the qp pair
+ * @len: the length
*
* Perform a shift based mtu round up divide
*/
@@ -664,8 +664,9 @@ static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len)
}
/**
- * @qp - the qp pair
- * @len - the length
+ * rvt_div_mtu - shift-based divide
+ * @qp: the qp pair
+ * @len: the length
*
* Perform a shift based mtu divide
*/
@@ -676,7 +677,7 @@ static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len)
/**
* rvt_timeout_to_jiffies - Convert a ULP timeout input into jiffies
- * @timeout - timeout input(0 - 31).
+ * @timeout: timeout input(0 - 31).
*
* Return a timeout value in jiffies.
*/
@@ -690,7 +691,8 @@ static inline unsigned long rvt_timeout_to_jiffies(u8 timeout)
/**
* rvt_lookup_qpn - return the QP with the given QPN
- * @ibp: the ibport
+ * @rdi: rvt device info structure
+ * @rvp: the ibport
* @qpn: the QP number to look up
*
* The caller must hold the rcu_read_lock(), and keep the lock until
@@ -716,9 +718,9 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
}
/**
- * rvt_mod_retry_timer - mod a retry timer
- * @qp - the QP
- * @shift - timeout shift to wait for multiple packets
+ * rvt_mod_retry_timer_ext - mod a retry timer
+ * @qp: the QP
+ * @shift: timeout shift to wait for multiple packets
* Modify a potentially already running retry timer
*/
static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
@@ -753,7 +755,7 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
}
/**
- * rvt_qp_sqwe_incr - increment ring index
+ * rvt_qp_swqe_incr - increment ring index
* @qp: the qp
* @val: the starting value
*
@@ -811,10 +813,10 @@ static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc,
/**
* rvt_qp_complete_swqe - insert send completion
- * @qp - the qp
- * @wqe - the send wqe
- * @opcode - wc operation (driver dependent)
- * @status - completion status
+ * @qp: the qp
+ * @wqe: the send wqe
+ * @opcode: wc operation (driver dependent)
+ * @status: completion status
*
* Update the s_last information, and then insert a send
* completion into the completion
@@ -891,7 +893,7 @@ void rvt_ruc_loopback(struct rvt_qp *qp);
/**
* struct rvt_qp_iter - the iterator for QPs
- * @qp - the current QP
+ * @qp: the current QP
*
* This structure defines the current iterator
* state for sequenced access to all QPs relative
@@ -913,7 +915,7 @@ struct rvt_qp_iter {
/**
* ib_cq_tail - Return tail index of cq buffer
- * @send_cq - The cq for send
+ * @send_cq: The cq for send
*
* This is called in qp_iter_print to get tail
* of cq buffer.
@@ -929,7 +931,7 @@ static inline u32 ib_cq_tail(struct ib_cq *send_cq)
/**
* ib_cq_head - Return head index of cq buffer
- * @send_cq - The cq for send
+ * @send_cq: The cq for send
*
* This is called in qp_iter_print to get head
* of cq buffer.
@@ -945,7 +947,7 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq)
/**
* rvt_free_rq - free memory allocated for rvt_rq struct
- * @rvt_rq: request queue data structure
+ * @rq: request queue data structure
*
* This function should only be called if the rvt_mmap_info()
* has not succeeded.
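As a worked example of the two divide helpers documented above: with qp->log_pmtu = 12 (a 4 KiB MTU), rvt_div_round_up_mtu(qp, 5000) is (5000 + 4095) >> 12 = 2, while rvt_div_mtu(qp, 5000) is 5000 >> 12 = 1. A sketch of the presumed shift-based implementations, assuming qp->log_pmtu holds log2 of the path MTU:

/* Round-up divide by the MTU using the log2 shift. */
static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len)
{
	return (len + (1 << qp->log_pmtu) - 1) >> qp->log_pmtu;
}

/* Truncating divide by the MTU. */
static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len)
{
	return len >> qp->log_pmtu;
}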
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index c218c89e0e2e..2c41920b641d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -450,6 +450,16 @@ struct iommu_hwpt_vtd_s1 {
* nested domain will translate the same as the nesting parent. The S1 will
* install a Context Descriptor Table pointing at userspace memory translated
* by the nesting parent.
+ *
+ * It's suggested to allocate a vDEVICE object carrying the vSID and then
+ * re-attach the nested domain as soon as the vSID is available at the VMM level:
+ *
+ * - when Cfg=translate, a vDEVICE must be allocated prior to attaching to the
+ * allocated nested domain, as CD/ATS invalidations and vevents need a vSID.
+ * - when Cfg=bypass/abort, a vDEVICE is not enforced during the nested domain
+ * attachment, to support a GBPA case where VM sets CR0.SMMUEN=0. However, if
+ * VM sets CR0.SMMUEN=1 while missing a vDEVICE object, the kernel will fail to
+ * report events to the VM. E.g. F_TRANSLATION when guest STE.Cfg=abort.
*/
struct iommu_hwpt_arm_smmuv3 {
__aligned_le64 ste[2];
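In VMM terms, the suggested ordering is: learn the vSID, allocate a vDEVICE carrying it, then (re-)attach the device to the nested domain. A hedged userspace sketch, assuming an open iommufd fd plus previously created viommu_id and idev_id, using the existing IOMMU_VDEVICE_ALLOC interface:

struct iommu_vdevice_alloc valloc = {
	.size = sizeof(valloc),
	.viommu_id = viommu_id,		/* vIOMMU backing the nested domain */
	.dev_id = idev_id,		/* iommufd device handle */
	.virt_id = vsid,		/* guest-visible Stream ID */
};

if (ioctl(iommufd, IOMMU_VDEVICE_ALLOC, &valloc))
	err(1, "IOMMU_VDEVICE_ALLOC");
/* ...then re-attach the device to the nested hwpt via the usual path. */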
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 75100bf009ba..ac2329f24141 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/ioctl.h>
+#include <linux/stddef.h>
#define VFIO_API_VERSION 0
@@ -1478,6 +1479,33 @@ struct vfio_device_feature_bus_master {
};
#define VFIO_DEVICE_FEATURE_BUS_MASTER 10
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
+ * regions selected.
+ *
+ * open_flags are the typical flags passed to open(2), e.g. O_RDWR, O_CLOEXEC,
+ * etc. offset/length specify a slice of the region to create the dmabuf from.
+ * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
+ *
+ * flags should be 0.
+ *
+ * Return: The new dma-buf fd on success; on failure, -1 is returned and errno is set.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF 11
+
+struct vfio_region_dma_range {
+ __u64 offset;
+ __u64 length;
+};
+
+struct vfio_device_feature_dma_buf {
+ __u32 region_index;
+ __u32 open_flags;
+ __u32 flags;
+ __u32 nr_ranges;
+ struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges);
+};
+
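A hedged userspace sketch of the new feature, assuming device_fd is an open VFIO device and BAR 0 is the region of interest; the dma_ranges array trails the header exactly as declared above:

struct vfio_device_feature *feat;
struct vfio_device_feature_dma_buf *get;
size_t sz = sizeof(*feat) + sizeof(*get) + sizeof(struct vfio_region_dma_range);
int dmabuf_fd;

feat = calloc(1, sz);
feat->argsz = sz;
feat->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF;

get = (void *)feat->data;
get->region_index = VFIO_PCI_BAR0_REGION_INDEX;
get->open_flags = O_RDWR | O_CLOEXEC;
get->nr_ranges = 1;
get->dma_ranges[0].offset = 0;
get->dma_ranges[0].length = 0x1000;	/* one page of the BAR */

dmabuf_fd = ioctl(device_fd, VFIO_DEVICE_FEATURE, feat);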
/* -------- API for Type1 VFIO IOMMU -------- */
/**
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index c691ac210ce2..e732e3456e27 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -40,7 +40,7 @@
#define _LINUX_VIRTIO_PCI_H
#include <linux/types.h>
-#include <linux/kernel.h>
+#include <linux/const.h>
#ifndef VIRTIO_PCI_NO_LEGACY
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f973e7e73c90..50c3fe2a1d55 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -479,8 +479,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
}
break;
case PCI_P2PDMA_MAP_BUS_ADDR:
- sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
- sg_phys(sg));
+ sg->dma_address = pci_p2pdma_bus_addr_map(
+ p2pdma_state.mem, sg_phys(sg));
sg_dma_len(sg) = sg->length;
sg_dma_mark_bus_address(sg);
continue;
diff --git a/mm/hmm.c b/mm/hmm.c
index 87562914670a..9bf0b831a029 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -811,7 +811,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
break;
case PCI_P2PDMA_MAP_BUS_ADDR:
pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
- return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ return pci_p2pdma_bus_addr_map(p2pdma_state->mem, paddr);
default:
return DMA_MAPPING_ERROR;
}
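Both call sites change the same way: pci_p2pdma_bus_addr_map() now takes the p2pdma provider (state->mem) rather than the whole mapping state, since the bus-address case needs nothing else from the state. A sketch of what the helper presumably reduces to; the struct and field names here are assumptions:

static inline dma_addr_t pci_p2pdma_bus_addr_map(struct p2pdma_provider *mem,
						 phys_addr_t paddr)
{
	return paddr + mem->bus_offset;	/* assumed provider field */
}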
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index 18623ba666e3..64ea19253ee3 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -143,11 +143,6 @@ static struct mdev_parent mbochs_parent;
static atomic_t mbochs_avail_mbytes;
static const struct vfio_device_ops mbochs_dev_ops;
-struct vfio_region_info_ext {
- struct vfio_region_info base;
- struct vfio_region_info_cap_type type;
-};
-
struct mbochs_mode {
u32 drm_format;
u32 bytepp;
@@ -1033,10 +1028,12 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf)
return 0;
}
-static int mbochs_get_region_info(struct mdev_state *mdev_state,
- struct vfio_region_info_ext *ext)
+static int mbochs_ioctl_get_region_info(struct vfio_device *vdev,
+ struct vfio_region_info *region_info,
+ struct vfio_info_cap *caps)
{
- struct vfio_region_info *region_info = &ext->base;
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
if (region_info->index >= MBOCHS_NUM_REGIONS)
return -EINVAL;
@@ -1061,20 +1058,23 @@ static int mbochs_get_region_info(struct mdev_state *mdev_state,
region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE);
break;
- case MBOCHS_EDID_REGION_INDEX:
- ext->base.argsz = sizeof(*ext);
- ext->base.offset = MBOCHS_EDID_OFFSET;
- ext->base.size = MBOCHS_EDID_SIZE;
- ext->base.flags = (VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE |
- VFIO_REGION_INFO_FLAG_CAPS);
- ext->base.cap_offset = offsetof(typeof(*ext), type);
- ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE;
- ext->type.header.version = 1;
- ext->type.header.next = 0;
- ext->type.type = VFIO_REGION_TYPE_GFX;
- ext->type.subtype = VFIO_REGION_SUBTYPE_GFX_EDID;
- break;
+ case MBOCHS_EDID_REGION_INDEX: {
+ struct vfio_region_info_cap_type cap_type = {
+ .header.id = VFIO_REGION_INFO_CAP_TYPE,
+ .header.version = 1,
+ .type = VFIO_REGION_TYPE_GFX,
+ .subtype = VFIO_REGION_SUBTYPE_GFX_EDID,
+ };
+
+ region_info->offset = MBOCHS_EDID_OFFSET;
+ region_info->size = MBOCHS_EDID_SIZE;
+ region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_CAPS);
+
+ return vfio_info_add_capability(caps, &cap_type.header,
+ sizeof(cap_type));
+ }
default:
region_info->size = 0;
region_info->offset = 0;
@@ -1191,7 +1191,7 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd,
struct mdev_state *mdev_state =
container_of(vdev, struct mdev_state, vdev);
int ret = 0;
- unsigned long minsz, outsz;
+ unsigned long minsz;
switch (cmd) {
case VFIO_DEVICE_GET_INFO:
@@ -1215,30 +1215,6 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd,
return 0;
}
- case VFIO_DEVICE_GET_REGION_INFO:
- {
- struct vfio_region_info_ext info;
-
- minsz = offsetofend(typeof(info), base.offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- outsz = info.base.argsz;
- if (outsz < minsz)
- return -EINVAL;
- if (outsz > sizeof(info))
- return -EINVAL;
-
- ret = mbochs_get_region_info(mdev_state, &info);
- if (ret)
- return ret;
-
- if (copy_to_user((void __user *)arg, &info, outsz))
- return -EFAULT;
-
- return 0;
- }
case VFIO_DEVICE_GET_IRQ_INFO:
{
@@ -1376,6 +1352,7 @@ static const struct vfio_device_ops mbochs_dev_ops = {
.read = mbochs_read,
.write = mbochs_write,
.ioctl = mbochs_ioctl,
+ .get_region_info_caps = mbochs_ioctl_get_region_info,
.mmap = mbochs_mmap,
.bind_iommufd = vfio_iommufd_emulated_bind,
.unbind_iommufd = vfio_iommufd_emulated_unbind,
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
index 8104831ae125..0759bd68edca 100644
--- a/samples/vfio-mdev/mdpy.c
+++ b/samples/vfio-mdev/mdpy.c
@@ -435,10 +435,13 @@ static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma)
return remap_vmalloc_range(vma, mdev_state->memblk, 0);
}
-static int mdpy_get_region_info(struct mdev_state *mdev_state,
- struct vfio_region_info *region_info,
- u16 *cap_type_id, void **cap_type)
+static int mdpy_ioctl_get_region_info(struct vfio_device *vdev,
+ struct vfio_region_info *region_info,
+ struct vfio_info_cap *caps)
{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+
if (region_info->index >= VFIO_PCI_NUM_REGIONS &&
region_info->index != MDPY_DISPLAY_REGION)
return -EINVAL;
@@ -544,30 +547,6 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd,
return 0;
}
- case VFIO_DEVICE_GET_REGION_INFO:
- {
- struct vfio_region_info info;
- u16 cap_type_id = 0;
- void *cap_type = NULL;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id,
- &cap_type);
- if (ret)
- return ret;
-
- if (copy_to_user((void __user *)arg, &info, minsz))
- return -EFAULT;
-
- return 0;
- }
case VFIO_DEVICE_GET_IRQ_INFO:
{
@@ -665,6 +644,7 @@ static const struct vfio_device_ops mdpy_dev_ops = {
.read = mdpy_read,
.write = mdpy_write,
.ioctl = mdpy_ioctl,
+ .get_region_info_caps = mdpy_ioctl_get_region_info,
.mmap = mdpy_mmap,
.bind_iommufd = vfio_iommufd_emulated_bind,
.unbind_iommufd = vfio_iommufd_emulated_unbind,
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index 59eefe2fed10..bd92c38379b8 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -624,7 +624,7 @@ static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state,
u8 lsr = 0;
mutex_lock(&mdev_state->rxtx_lock);
- /* atleast one char in FIFO */
+ /* at least one char in FIFO */
if (mdev_state->s[index].rxtx.head !=
mdev_state->s[index].rxtx.tail)
lsr |= UART_LSR_DR;
@@ -1717,10 +1717,12 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
return ret;
}
-static int mtty_get_region_info(struct mdev_state *mdev_state,
- struct vfio_region_info *region_info,
- u16 *cap_type_id, void **cap_type)
+static int mtty_ioctl_get_region_info(struct vfio_device *vdev,
+ struct vfio_region_info *region_info,
+ struct vfio_info_cap *caps)
{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
unsigned int size = 0;
u32 bar_index;
@@ -1817,30 +1819,6 @@ static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd,
return 0;
}
- case VFIO_DEVICE_GET_REGION_INFO:
- {
- struct vfio_region_info info;
- u16 cap_type_id = 0;
- void *cap_type = NULL;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- ret = mtty_get_region_info(mdev_state, &info, &cap_type_id,
- &cap_type);
- if (ret)
- return ret;
-
- if (copy_to_user((void __user *)arg, &info, minsz))
- return -EFAULT;
-
- return 0;
- }
case VFIO_DEVICE_GET_IRQ_INFO:
{
@@ -1949,6 +1927,7 @@ static const struct vfio_device_ops mtty_dev_ops = {
.read = mtty_read,
.write = mtty_write,
.ioctl = mtty_ioctl,
+ .get_region_info_caps = mtty_ioctl_get_region_info,
.bind_iommufd = vfio_iommufd_emulated_bind,
.unbind_iommufd = vfio_iommufd_emulated_unbind,
.attach_ioas = vfio_iommufd_emulated_attach_ioas,
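All three samples are converted the same way: the VFIO_DEVICE_GET_REGION_INFO argsz/copy_{from,to}_user boilerplate moves into the VFIO core, and the driver's .get_region_info_caps hook only fills in the region info and, when needed, appends capabilities via vfio_info_add_capability(). A skeleton of the hook; the driver names and region size are hypothetical:

static int mydev_ioctl_get_region_info(struct vfio_device *vdev,
				       struct vfio_region_info *region_info,
				       struct vfio_info_cap *caps)
{
	struct mydev_state *mdev_state =
		container_of(vdev, struct mydev_state, vdev);

	switch (region_info->index) {
	case 0:
		region_info->offset = 0;
		region_info->size = MYDEV_BAR0_SIZE;	/* hypothetical */
		region_info->flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
		return 0;
	default:
		return -EINVAL;
	}
}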
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 0d5ce4b74b9f..0e151d0572d1 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -4,13 +4,12 @@ ldflags-y += --wrap=is_acpi_device_node
ldflags-y += --wrap=acpi_evaluate_integer
ldflags-y += --wrap=acpi_pci_find_root
ldflags-y += --wrap=nvdimm_bus_register
-ldflags-y += --wrap=devm_cxl_port_enumerate_dports
ldflags-y += --wrap=cxl_await_media_ready
ldflags-y += --wrap=devm_cxl_add_rch_dport
-ldflags-y += --wrap=cxl_rcd_component_reg_phys
ldflags-y += --wrap=cxl_endpoint_parse_cdat
ldflags-y += --wrap=cxl_dport_init_ras_reporting
ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup
+ldflags-y += --wrap=hmat_get_extended_linear_cache_size
DRIVERS := ../../../drivers
CXL_SRC := $(DRIVERS)/cxl
diff --git a/tools/testing/cxl/test/Kbuild b/tools/testing/cxl/test/Kbuild
index 6b1927897856..af50972c8b6d 100644
--- a/tools/testing/cxl/test/Kbuild
+++ b/tools/testing/cxl/test/Kbuild
@@ -4,6 +4,7 @@ ccflags-y := -I$(srctree)/drivers/cxl/ -I$(srctree)/drivers/cxl/core
obj-m += cxl_test.o
obj-m += cxl_mock.o
obj-m += cxl_mock_mem.o
+obj-m += cxl_translate.o
cxl_test-y := cxl.o
cxl_mock-y := mock.o
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 2d135ca533d0..81e2aef3627a 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -15,6 +15,7 @@
#include "mock.h"
static int interleave_arithmetic;
+static bool extended_linear_cache;
#define FAKE_QTG_ID 42
@@ -26,6 +27,9 @@ static int interleave_arithmetic;
#define NR_CXL_PORT_DECODERS 8
#define NR_BRIDGES (NR_CXL_HOST_BRIDGES + NR_CXL_SINGLE_HOST + NR_CXL_RCH)
+#define MOCK_AUTO_REGION_SIZE_DEFAULT SZ_512M
+static int mock_auto_region_size = MOCK_AUTO_REGION_SIZE_DEFAULT;
+
static struct platform_device *cxl_acpi;
static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES];
#define NR_MULTI_ROOT (NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS)
@@ -426,6 +430,22 @@ static struct cxl_mock_res *alloc_mock_res(resource_size_t size, int align)
return res;
}
+/* Only update CFMWS0 as this is used by the auto region. */
+static void cfmws_elc_update(struct acpi_cedt_cfmws *window, int index)
+{
+ if (!extended_linear_cache)
+ return;
+
+ if (index != 0)
+ return;
+
+ /*
+ * The window size should be 2x the CXL region size, where half is
+ * DRAM and half is CXL.
+ */
+ window->window_size = mock_auto_region_size * 2;
+}
+
static int populate_cedt(void)
{
struct cxl_mock_res *res;
@@ -450,6 +470,7 @@ static int populate_cedt(void)
for (i = cfmws_start; i <= cfmws_end; i++) {
struct acpi_cedt_cfmws *window = mock_cfmws[i];
+ cfmws_elc_update(window, i);
res = alloc_mock_res(window->window_size, SZ_256M);
if (!res)
return -ENOMEM;
@@ -591,6 +612,25 @@ mock_acpi_evaluate_integer(acpi_handle handle, acpi_string pathname,
return AE_OK;
}
+static int
+mock_hmat_get_extended_linear_cache_size(struct resource *backing_res,
+ int nid, resource_size_t *cache_size)
+{
+ struct acpi_cedt_cfmws *window = mock_cfmws[0];
+ struct resource cfmws0_res =
+ DEFINE_RES_MEM(window->base_hpa, window->window_size);
+
+ if (!extended_linear_cache ||
+ !resource_contains(&cfmws0_res, backing_res)) {
+ return hmat_get_extended_linear_cache_size(backing_res,
+ nid, cache_size);
+ }
+
+ *cache_size = mock_auto_region_size;
+
+ return 0;
+}
+
static struct pci_bus mock_pci_bus[NR_BRIDGES];
static struct acpi_pci_root mock_pci_root[ARRAY_SIZE(mock_pci_bus)] = {
[0] = {
@@ -738,7 +778,6 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld)
struct cxl_endpoint_decoder *cxled;
struct cxl_switch_decoder *cxlsd;
struct cxl_port *port, *iter;
- const int size = SZ_512M;
struct cxl_memdev *cxlmd;
struct cxl_dport *dport;
struct device *dev;
@@ -781,9 +820,11 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld)
}
base = window->base_hpa;
+ if (extended_linear_cache)
+ base += mock_auto_region_size;
cxld->hpa_range = (struct range) {
.start = base,
- .end = base + size - 1,
+ .end = base + mock_auto_region_size - 1,
};
cxld->interleave_ways = 2;
@@ -792,7 +833,8 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld)
cxld->flags = CXL_DECODER_F_ENABLE;
cxled->state = CXL_DECODER_STATE_AUTO;
port->commit_end = cxld->id;
- devm_cxl_dpa_reserve(cxled, 0, size / cxld->interleave_ways, 0);
+ devm_cxl_dpa_reserve(cxled, 0,
+ mock_auto_region_size / cxld->interleave_ways, 0);
cxld->commit = mock_decoder_commit;
cxld->reset = mock_decoder_reset;
@@ -841,7 +883,7 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld)
cxld->interleave_granularity = 4096;
cxld->hpa_range = (struct range) {
.start = base,
- .end = base + size - 1,
+ .end = base + mock_auto_region_size - 1,
};
put_device(dev);
}
@@ -995,37 +1037,6 @@ static int get_port_array(struct cxl_port *port,
return 0;
}
-static int mock_cxl_port_enumerate_dports(struct cxl_port *port)
-{
- struct platform_device **array;
- int i, array_size;
- int rc;
-
- rc = get_port_array(port, &array, &array_size);
- if (rc)
- return rc;
-
- for (i = 0; i < array_size; i++) {
- struct platform_device *pdev = array[i];
- struct cxl_dport *dport;
-
- if (pdev->dev.parent != port->uport_dev) {
- dev_dbg(&port->dev, "%s: mismatch parent %s\n",
- dev_name(port->uport_dev),
- dev_name(pdev->dev.parent));
- continue;
- }
-
- dport = devm_cxl_add_dport(port, &pdev->dev, pdev->id,
- CXL_RESOURCE_NONE);
-
- if (IS_ERR(dport))
- return PTR_ERR(dport);
- }
-
- return 0;
-}
-
static struct cxl_dport *mock_cxl_add_dport_by_dev(struct cxl_port *port,
struct device *dport_dev)
{
@@ -1114,9 +1125,10 @@ static struct cxl_mock_ops cxl_mock_ops = {
.acpi_pci_find_root = mock_acpi_pci_find_root,
.devm_cxl_switch_port_decoders_setup = mock_cxl_switch_port_decoders_setup,
.devm_cxl_endpoint_decoders_setup = mock_cxl_endpoint_decoders_setup,
- .devm_cxl_port_enumerate_dports = mock_cxl_port_enumerate_dports,
.cxl_endpoint_parse_cdat = mock_cxl_endpoint_parse_cdat,
.devm_cxl_add_dport_by_dev = mock_cxl_add_dport_by_dev,
+ .hmat_get_extended_linear_cache_size =
+ mock_hmat_get_extended_linear_cache_size,
.list = LIST_HEAD_INIT(cxl_mock_ops.list),
};
@@ -1606,6 +1618,8 @@ static __exit void cxl_test_exit(void)
module_param(interleave_arithmetic, int, 0444);
MODULE_PARM_DESC(interleave_arithmetic, "Modulo:0, XOR:1");
+module_param(extended_linear_cache, bool, 0444);
+MODULE_PARM_DESC(extended_linear_cache, "Enable extended linear cache support");
module_init(cxl_test_init);
module_exit(cxl_test_exit);
MODULE_LICENSE("GPL v2");
diff --git a/tools/testing/cxl/test/cxl_translate.c b/tools/testing/cxl/test/cxl_translate.c
new file mode 100644
index 000000000000..2200ae21795c
--- /dev/null
+++ b/tools/testing/cxl/test/cxl_translate.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright(c) 2025 Intel Corporation. All rights reserved.
+
+/* Preface all log entries with "cxl_translate" */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/acpi.h>
+#include <cxlmem.h>
+#include <cxl.h>
+
+/* Maximum number of test vectors and entry length */
+#define MAX_TABLE_ENTRIES 128
+#define MAX_ENTRY_LEN 128
+
+/* Expected number of parameters in each test vector */
+#define EXPECTED_PARAMS 7
+
+/* Module parameters for test vectors */
+static char *table[MAX_TABLE_ENTRIES];
+static int table_num;
+
+/* Interleave Arithmetic */
+#define MODULO_MATH 0
+#define XOR_MATH 1
+
+/*
+ * XOR mapping configuration
+ * The test data sets all use the same set of xormaps. When additional
+ * data sets arrive for validation, this static setup will need to
+ * be changed to accept xormaps as additional parameters.
+ */
+static struct cxl_cxims_data *cximsd;
+static u64 xormaps[] = {
+ 0x2020900,
+ 0x4041200,
+ 0x1010400,
+ 0x800,
+};
+
+static int nr_maps = ARRAY_SIZE(xormaps);
+
+#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1)
+static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = {
+ [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4
+};
+
+/**
+ * to_hpa - calculate an HPA offset from a DPA offset and position
+ *
+ * @dpa_offset: device physical address offset
+ * @pos: device's position in the interleave
+ * @r_eiw: region encoded interleave ways
+ * @r_eig: region encoded interleave granularity
+ * @hb_ways: host bridge interleave ways
+ * @math: interleave arithmetic (MODULO_MATH or XOR_MATH)
+ *
+ * Returns: host physical address offset
+ */
+static u64 to_hpa(u64 dpa_offset, int pos, u8 r_eiw, u16 r_eig, u8 hb_ways,
+ u8 math)
+{
+ u64 hpa_offset;
+
+ /* Calculate base HPA offset from DPA and position */
+ hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, r_eiw, r_eig);
+
+ if (math == XOR_MATH) {
+ cximsd->nr_maps = hbiw_to_nr_maps[hb_ways];
+ if (cximsd->nr_maps)
+ return cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways);
+ }
+ return hpa_offset;
+}
+
+/**
+ * to_dpa - translate an HPA offset to a DPA offset
+ *
+ * @hpa_offset: host physical address offset
+ * @r_eiw: region encoded interleave ways
+ * @r_eig: region encoded interleave granularity
+ * @hb_ways: host bridge interleave ways
+ * @math: interleave arithmetic (MODULO_MATH or XOR_MATH)
+ *
+ * Returns: device physical address offset
+ */
+static u64 to_dpa(u64 hpa_offset, u8 r_eiw, u16 r_eig, u8 hb_ways, u8 math)
+{
+ u64 offset = hpa_offset;
+
+ if (math == XOR_MATH) {
+ cximsd->nr_maps = hbiw_to_nr_maps[hb_ways];
+ if (cximsd->nr_maps)
+ offset =
+ cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways);
+ }
+ return cxl_calculate_dpa_offset(offset, r_eiw, r_eig);
+}
+
+/**
+ * to_pos - extract an interleave position from an HPA offset
+ *
+ * @hpa_offset: host physical address offset
+ * @r_eiw: region encoded interleave ways
+ * @r_eig: region encoded interleave granularity
+ * @hb_ways: host bridge interleave ways
+ * @math: interleave arithmetic (MODULO_MATH or XOR_MATH)
+ *
+ * Returns: device's position in the region interleave
+ */
+static u64 to_pos(u64 hpa_offset, u8 r_eiw, u16 r_eig, u8 hb_ways, u8 math)
+{
+ u64 offset = hpa_offset;
+
+ /* Reverse XOR mapping if specified */
+ if (math == XOR_MATH)
+ offset = cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways);
+
+ return cxl_calculate_position(offset, r_eiw, r_eig);
+}
+
+/**
+ * run_translation_test - execute forward and reverse translations
+ *
+ * @dpa: device physical address
+ * @pos: expected position in region interleave
+ * @r_eiw: region encoded interleave ways
+ * @r_eig: region encoded interleave granularity
+ * @hb_ways: host bridge interleave ways
+ * @math: interleave arithmetic (MODULO_MATH or XOR_MATH)
+ * @expect_hpa: expected host physical address
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int run_translation_test(u64 dpa, int pos, u8 r_eiw, u16 r_eig,
+ u8 hb_ways, int math, u64 expect_hpa)
+{
+ u64 translated_spa, reverse_dpa;
+ int reverse_pos;
+
+ /* Test Device to Host translation: DPA + POS -> SPA */
+ translated_spa = to_hpa(dpa, pos, r_eiw, r_eig, hb_ways, math);
+ if (translated_spa != expect_hpa) {
+ pr_err("Device to host failed: expected HPA %llu, got %llu\n",
+ expect_hpa, translated_spa);
+ return -1;
+ }
+
+ /* Test Host to Device DPA translation: SPA -> DPA */
+ reverse_dpa = to_dpa(translated_spa, r_eiw, r_eig, hb_ways, math);
+ if (reverse_dpa != dpa) {
+ pr_err("Host to Device DPA failed: expected %llu, got %llu\n",
+ dpa, reverse_dpa);
+ return -1;
+ }
+
+ /* Test Host to Device Position translation: SPA -> POS */
+ reverse_pos = to_pos(translated_spa, r_eiw, r_eig, hb_ways, math);
+ if (reverse_pos != pos) {
+ pr_err("Position lookup failed: expected %d, got %d\n", pos,
+ reverse_pos);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * parse_test_vector - parse a single test vector string
+ *
+ * @entry: test vector string to parse
+ * @dpa: device physical address
+ * @pos: expected position in region interleave
+ * @r_eiw: region encoded interleave ways
+ * @r_eig: region encoded interleave granularity
+ * @hb_ways: host bridge interleave ways
+ * @math: interleave arithmetic (MODULO_MATH or XOR_MATH)
+ * @expect_hpa: expected host physical address
+ *
+ * Returns: 0 on success, negative error code on failure
+ */
+static int parse_test_vector(const char *entry, u64 *dpa, int *pos, u8 *r_eiw,
+ u16 *r_eig, u8 *hb_ways, int *math,
+ u64 *expect_hpa)
+{
+ unsigned int tmp_r_eiw, tmp_r_eig, tmp_hb_ways;
+ int parsed;
+
+ parsed = sscanf(entry, "%llu %d %u %u %u %d %llu", dpa, pos, &tmp_r_eiw,
+ &tmp_r_eig, &tmp_hb_ways, math, expect_hpa);
+
+ if (parsed != EXPECTED_PARAMS) {
+ pr_err("Parse error: expected %d parameters, got %d in '%s'\n",
+ EXPECTED_PARAMS, parsed, entry);
+ return -EINVAL;
+ }
+ if (tmp_r_eiw > U8_MAX || tmp_r_eig > U16_MAX || tmp_hb_ways > U8_MAX) {
+ pr_err("Parameter overflow in entry: '%s'\n", entry);
+ return -ERANGE;
+ }
+ if (*math != MODULO_MATH && *math != XOR_MATH) {
+ pr_err("Invalid math type %d in entry: '%s'\n", *math, entry);
+ return -EINVAL;
+ }
+ *r_eiw = tmp_r_eiw;
+ *r_eig = tmp_r_eig;
+ *hb_ways = tmp_hb_ways;
+
+ return 0;
+}
+
+/*
+ * setup_xor_mapping - Initialize XOR mapping data structure
+ *
+ * The test data sets all use the same HBIG so we can use one set
+ * of xormaps, and set the number to apply based on HBIW before
+ * calling cxl_do_xormap_calc().
+ *
+ * When additional data sets arrive for validation with different
+ * HBIG's this static setup will need to be updated.
+ *
+ * Returns: 0 on success, negative error code on failure
+ */
+static int setup_xor_mapping(void)
+{
+ if (nr_maps <= 0)
+ return -EINVAL;
+
+ cximsd = kzalloc(struct_size(cximsd, xormaps, nr_maps), GFP_KERNEL);
+ if (!cximsd)
+ return -ENOMEM;
+
+ memcpy(cximsd->xormaps, xormaps, nr_maps * sizeof(*cximsd->xormaps));
+ cximsd->nr_maps = nr_maps;
+
+ return 0;
+}
+
+static int test_random_params(void)
+{
+ u8 valid_eiws[] = { 0, 1, 2, 3, 4, 8, 9, 10 };
+ u16 valid_eigs[] = { 0, 1, 2, 3, 4, 5, 6 };
+ int i, ways, pos, reverse_pos;
+ u64 dpa, hpa, reverse_dpa;
+ int iterations = 10000;
+ int failures = 0;
+
+ for (i = 0; i < iterations; i++) {
+ /* Generate valid random parameters for eiw, eig, pos, dpa */
+ u8 eiw = valid_eiws[get_random_u32() % ARRAY_SIZE(valid_eiws)];
+ u16 eig = valid_eigs[get_random_u32() % ARRAY_SIZE(valid_eigs)];
+
+ eiw_to_ways(eiw, &ways);
+ pos = get_random_u32() % ways;
+ dpa = get_random_u64() >> 12;
+
+ hpa = cxl_calculate_hpa_offset(dpa, pos, eiw, eig);
+ reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig);
+ reverse_pos = cxl_calculate_position(hpa, eiw, eig);
+
+ if (reverse_dpa != dpa || reverse_pos != pos) {
+ pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n",
+ i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw,
+ eig);
+
+ if (failures++ > 10) {
+ pr_err("test random too many failures, stop\n");
+ break;
+ }
+ }
+ }
+ pr_info("..... test random: PASS %d FAIL %d\n", i - failures, failures);
+
+ if (failures)
+ return -EINVAL;
+
+ return 0;
+}
+
+struct param_test {
+ u8 eiw;
+ u16 eig;
+ int pos;
+ bool expect; /* true: expect pass, false: expect fail */
+ const char *desc;
+};
+
+static struct param_test param_tests[] = {
+ { 0x0, 0, 0, true, "1-way, min eig=0, pos=0" },
+ { 0x0, 3, 0, true, "1-way, mid eig=3, pos=0" },
+ { 0x0, 6, 0, true, "1-way, max eig=6, pos=0" },
+ { 0x1, 0, 0, true, "2-way, eig=0, pos=0" },
+ { 0x1, 3, 1, true, "2-way, eig=3, max pos=1" },
+ { 0x1, 6, 1, true, "2-way, eig=6, max pos=1" },
+ { 0x2, 0, 0, true, "4-way, eig=0, pos=0" },
+ { 0x2, 3, 3, true, "4-way, eig=3, max pos=3" },
+ { 0x2, 6, 3, true, "4-way, eig=6, max pos=3" },
+ { 0x3, 0, 0, true, "8-way, eig=0, pos=0" },
+ { 0x3, 3, 7, true, "8-way, eig=3, max pos=7" },
+ { 0x3, 6, 7, true, "8-way, eig=6, max pos=7" },
+ { 0x4, 0, 0, true, "16-way, eig=0, pos=0" },
+ { 0x4, 3, 15, true, "16-way, eig=3, max pos=15" },
+ { 0x4, 6, 15, true, "16-way, eig=6, max pos=15" },
+ { 0x8, 0, 0, true, "3-way, eig=0, pos=0" },
+ { 0x8, 3, 2, true, "3-way, eig=3, max pos=2" },
+ { 0x8, 6, 2, true, "3-way, eig=6, max pos=2" },
+ { 0x9, 0, 0, true, "6-way, eig=0, pos=0" },
+ { 0x9, 3, 5, true, "6-way, eig=3, max pos=5" },
+ { 0x9, 6, 5, true, "6-way, eig=6, max pos=5" },
+ { 0xA, 0, 0, true, "12-way, eig=0, pos=0" },
+ { 0xA, 3, 11, true, "12-way, eig=3, max pos=11" },
+ { 0xA, 6, 11, true, "12-way, eig=6, max pos=11" },
+ { 0x5, 0, 0, false, "invalid eiw=5" },
+ { 0x7, 0, 0, false, "invalid eiw=7" },
+ { 0xB, 0, 0, false, "invalid eiw=0xB" },
+ { 0xFF, 0, 0, false, "invalid eiw=0xFF" },
+ { 0x1, 7, 0, false, "invalid eig=7 (out of range)" },
+ { 0x2, 0x10, 0, false, "invalid eig=0x10" },
+ { 0x3, 0xFFFF, 0, false, "invalid eig=0xFFFF" },
+ { 0x1, 0, -1, false, "pos < 0" },
+ { 0x1, 0, 2, false, "2-way, pos=2 (>= ways)" },
+ { 0x2, 0, 4, false, "4-way, pos=4 (>= ways)" },
+ { 0x3, 0, 8, false, "8-way, pos=8 (>= ways)" },
+ { 0x4, 0, 16, false, "16-way, pos=16 (>= ways)" },
+ { 0x8, 0, 3, false, "3-way, pos=3 (>= ways)" },
+ { 0x9, 0, 6, false, "6-way, pos=6 (>= ways)" },
+ { 0xA, 0, 12, false, "12-way, pos=12 (>= ways)" },
+};
+
+static int test_cxl_validate_translation_params(void)
+{
+ int i, rc, failures = 0;
+ bool valid;
+
+ for (i = 0; i < ARRAY_SIZE(param_tests); i++) {
+ struct param_test *t = &param_tests[i];
+
+ rc = cxl_validate_translation_params(t->eiw, t->eig, t->pos);
+ valid = (rc == 0);
+
+ if (valid != t->expect) {
+ pr_err("test params failed: %s\n", t->desc);
+ failures++;
+ }
+ }
+ pr_info("..... test params: PASS %d FAIL %d\n", i - failures, failures);
+
+ if (failures)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * cxl_translate_init
+ *
+ * Run the internal validation tests when no params are passed.
+ * Otherwise, parse the parameters (test vectors), and kick off
+ * the translation test.
+ *
+ * Returns: 0 on success, negative error code on failure
+ */
+static int __init cxl_translate_init(void)
+{
+ int rc, i;
+
+ /* If no tables are passed, validate module params only */
+ if (table_num == 0) {
+ pr_info("Internal validation test start...\n");
+ rc = test_cxl_validate_translation_params();
+ if (rc)
+ return rc;
+
+ rc = test_random_params();
+ if (rc)
+ return rc;
+
+ pr_info("Internal validation test completed successfully\n");
+
+ return 0;
+ }
+
+ pr_info("CXL translate test module loaded with %d test vectors\n",
+ table_num);
+
+ rc = setup_xor_mapping();
+ if (rc)
+ return rc;
+
+ /* Process each test vector */
+ for (i = 0; i < table_num; i++) {
+ u64 dpa, expect_spa;
+ int pos, math;
+ u8 r_eiw, hb_ways;
+ u16 r_eig;
+
+ pr_debug("Processing test vector %d: '%s'\n", i, table[i]);
+
+ /* Parse the test vector */
+ rc = parse_test_vector(table[i], &dpa, &pos, &r_eiw, &r_eig,
+ &hb_ways, &math, &expect_spa);
+ if (rc) {
+ pr_err("CXL Translate Test %d: FAIL\n"
+ " Failed to parse test vector '%s'\n",
+ i, table[i]);
+ continue;
+ }
+ /* Run the translation test */
+ rc = run_translation_test(dpa, pos, r_eiw, r_eig, hb_ways, math,
+ expect_spa);
+ if (rc) {
+ pr_err("CXL Translate Test %d: FAIL\n"
+ " dpa=%llu pos=%d r_eiw=%u r_eig=%u hb_ways=%u math=%s expect_spa=%llu\n",
+ i, dpa, pos, r_eiw, r_eig, hb_ways,
+ (math == XOR_MATH) ? "XOR" : "MODULO",
+ expect_spa);
+ } else {
+ pr_info("CXL Translate Test %d: PASS\n", i);
+ }
+ }
+
+ kfree(cximsd);
+ pr_info("CXL translate test completed\n");
+
+ return 0;
+}
+
+static void __exit cxl_translate_exit(void)
+{
+ pr_info("CXL translate test module unloaded\n");
+}
+
+module_param_array(table, charp, &table_num, 0444);
+MODULE_PARM_DESC(table, "Test vectors as space-separated decimal strings");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("cxl_test: cxl address translation test module");
+MODULE_IMPORT_NS("CXL");
+
+module_init(cxl_translate_init);
+module_exit(cxl_translate_exit);
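For the power-of-2 cases, the modulo arithmetic exercised here is a bit-splice: the granularity is 256 << eig bytes and the way count is 1 << eiw, so the device position is inserted just above the granularity bits of the DPA offset. A hedged sketch of that forward mapping (the in-kernel helpers additionally handle the 3/6/12-way eiw encodings):

/* DPA offset + position -> HPA offset, power-of-2 ways only. */
static u64 modulo_to_hpa(u64 dpa_offset, int pos, u8 eiw, u16 eig)
{
	unsigned int gbits = eig + 8;			/* granularity = 256 << eig */
	u64 lo = dpa_offset & ((1ULL << gbits) - 1);	/* bits below granularity */

	return ((dpa_offset >> gbits) << (gbits + eiw)) |
	       ((u64)pos << gbits) | lo;
}

For example, with eiw=1 (2-way), eig=0 (256 B), pos=1 and dpa_offset=256, this yields HPA offset 768: region chunk 3, which a 2-way decode maps back to device 1, device chunk 1, i.e. DPA offset 256 again.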
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index d533481672b7..176dcde570cd 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -250,22 +250,21 @@ static void mes_add_event(struct mock_event_store *mes,
* Vary the number of events returned to simulate events occurring while the
* logs are being read.
*/
-static int ret_limit = 0;
+static atomic_t event_counter = ATOMIC_INIT(0);
static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd)
{
struct cxl_get_event_payload *pl;
struct mock_event_log *log;
- u16 nr_overflow;
+ int ret_limit;
u8 log_type;
int i;
if (cmd->size_in != sizeof(log_type))
return -EINVAL;
- ret_limit = (ret_limit + 1) % CXL_TEST_EVENT_RET_MAX;
- if (!ret_limit)
- ret_limit = 1;
+ /* Vary return limit from 1 to CXL_TEST_EVENT_RET_MAX */
+ ret_limit = (atomic_inc_return(&event_counter) % CXL_TEST_EVENT_RET_MAX) + 1;
if (cmd->size_out < struct_size(pl, records, ret_limit))
return -EINVAL;
@@ -299,7 +298,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd)
u64 ns;
pl->flags |= CXL_GET_EVENT_FLAG_OVERFLOW;
- pl->overflow_err_count = cpu_to_le16(nr_overflow);
+ pl->overflow_err_count = cpu_to_le16(log->nr_overflow);
ns = ktime_get_real_ns();
ns -= 5000000000; /* 5s ago */
pl->first_overflow_timestamp = cpu_to_le64(ns);
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index 995269a75cbd..44bce80ef3ff 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -111,6 +111,26 @@ acpi_status __wrap_acpi_evaluate_integer(acpi_handle handle,
}
EXPORT_SYMBOL(__wrap_acpi_evaluate_integer);
+int __wrap_hmat_get_extended_linear_cache_size(struct resource *backing_res,
+ int nid,
+ resource_size_t *cache_size)
+{
+ int index, rc;
+ struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
+
+ if (ops)
+ rc = ops->hmat_get_extended_linear_cache_size(backing_res, nid,
+ cache_size);
+ else
+ rc = hmat_get_extended_linear_cache_size(backing_res, nid,
+ cache_size);
+
+ put_cxl_mock_ops(index);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(__wrap_hmat_get_extended_linear_cache_size);
+
struct acpi_pci_root *__wrap_acpi_pci_find_root(acpi_handle handle)
{
int index;
@@ -172,21 +192,6 @@ int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port)
}
EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_endpoint_decoders_setup, "CXL");
-int __wrap_devm_cxl_port_enumerate_dports(struct cxl_port *port)
-{
- int rc, index;
- struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
-
- if (ops && ops->is_mock_port(port->uport_dev))
- rc = ops->devm_cxl_port_enumerate_dports(port);
- else
- rc = devm_cxl_port_enumerate_dports(port);
- put_cxl_mock_ops(index);
-
- return rc;
-}
-EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_port_enumerate_dports, "CXL");
-
int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds)
{
int rc, index;
@@ -226,23 +231,6 @@ struct cxl_dport *__wrap_devm_cxl_add_rch_dport(struct cxl_port *port,
}
EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_rch_dport, "CXL");
-resource_size_t __wrap_cxl_rcd_component_reg_phys(struct device *dev,
- struct cxl_dport *dport)
-{
- int index;
- resource_size_t component_reg_phys;
- struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
-
- if (ops && ops->is_mock_port(dev))
- component_reg_phys = CXL_RESOURCE_NONE;
- else
- component_reg_phys = cxl_rcd_component_reg_phys(dev, dport);
- put_cxl_mock_ops(index);
-
- return component_reg_phys;
-}
-EXPORT_SYMBOL_NS_GPL(__wrap_cxl_rcd_component_reg_phys, "CXL");
-
void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port)
{
int index;
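The new __wrap_hmat_get_extended_linear_cache_size() follows the cxl_test convention set by the --wrap= ldflags in the Kbuild change earlier: with "ld --wrap=sym", callers of sym() are redirected to __wrap_sym(), which can defer to a mock or fall back to the original. A generic illustration of the pattern, with placeholder names (mock.c itself can call the real export directly because the wrap flags only apply to the objects under test; a pure --wrap setup would use __real_foo):

int __real_foo(int arg);		/* binds to the original foo() */

int __wrap_foo(int arg)			/* callers of foo() land here */
{
	if (mocking_enabled())		/* hypothetical test hook */
		return mock_foo(arg);
	return __real_foo(arg);
}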
diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h
index 4ed932e76aae..2684b89c8aa2 100644
--- a/tools/testing/cxl/test/mock.h
+++ b/tools/testing/cxl/test/mock.h
@@ -19,12 +19,14 @@ struct cxl_mock_ops {
bool (*is_mock_bus)(struct pci_bus *bus);
bool (*is_mock_port)(struct device *dev);
bool (*is_mock_dev)(struct device *dev);
- int (*devm_cxl_port_enumerate_dports)(struct cxl_port *port);
int (*devm_cxl_switch_port_decoders_setup)(struct cxl_port *port);
int (*devm_cxl_endpoint_decoders_setup)(struct cxl_port *port);
void (*cxl_endpoint_parse_cdat)(struct cxl_port *port);
struct cxl_dport *(*devm_cxl_add_dport_by_dev)(struct cxl_port *port,
struct device *dport_dev);
+ int (*hmat_get_extended_linear_cache_size)(struct resource *backing_res,
+ int nid,
+ resource_size_t *cache_size);
};
void register_cxl_mock_ops(struct cxl_mock_ops *ops);
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index bb4d33dde3c8..10e051b6f592 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -13,9 +13,6 @@
static unsigned long HUGEPAGE_SIZE;
-#define MOCK_PAGE_SIZE (PAGE_SIZE / 2)
-#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE)
-
static unsigned long get_huge_page_size(void)
{
char buf[80];
@@ -1574,6 +1571,49 @@ TEST_F(iommufd_ioas, copy_sweep)
test_ioctl_destroy(dst_ioas_id);
}
+TEST_F(iommufd_ioas, dmabuf_simple)
+{
+ size_t buf_size = PAGE_SIZE * 4;
+ __u64 iova;
+ int dfd;
+
+ test_cmd_get_dmabuf(buf_size, &dfd);
+ test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, 0, &iova);
+ test_err_ioctl_ioas_map_file(EINVAL, dfd, buf_size, buf_size, &iova);
+ test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, buf_size + 1, &iova);
+ test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova);
+
+ close(dfd);
+}
+
+TEST_F(iommufd_ioas, dmabuf_revoke)
+{
+ size_t buf_size = PAGE_SIZE * 4;
+ __u32 hwpt_id;
+ __u64 iova;
+ __u64 iova2;
+ int dfd;
+
+ test_cmd_get_dmabuf(buf_size, &dfd);
+ test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova);
+ test_cmd_revoke_dmabuf(dfd, true);
+
+ if (variant->mock_domains)
+ test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0,
+ &hwpt_id);
+
+ test_err_ioctl_ioas_map_file(ENODEV, dfd, 0, buf_size, &iova2);
+
+ test_cmd_revoke_dmabuf(dfd, false);
+ test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova2);
+
+ /* Restore the iova back */
+ test_ioctl_ioas_unmap(iova, buf_size);
+ test_ioctl_ioas_map_fixed_file(dfd, 0, buf_size, iova);
+
+ close(dfd);
+}
+
FIXTURE(iommufd_mock_domain)
{
int fd;
@@ -2058,6 +2098,12 @@ FIXTURE_VARIANT(iommufd_dirty_tracking)
FIXTURE_SETUP(iommufd_dirty_tracking)
{
+ struct iommu_option cmd = {
+ .size = sizeof(cmd),
+ .option_id = IOMMU_OPTION_HUGE_PAGES,
+ .op = IOMMU_OPTION_OP_SET,
+ .val64 = 0,
+ };
size_t mmap_buffer_size;
unsigned long size;
int mmap_flags;
@@ -2066,7 +2112,7 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
if (variant->buffer_size < MOCK_PAGE_SIZE) {
SKIP(return,
- "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu",
+ "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%u",
variant->buffer_size, MOCK_PAGE_SIZE);
}
@@ -2114,16 +2160,18 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
assert((uintptr_t)self->bitmap % PAGE_SIZE == 0);
test_ioctl_ioas_alloc(&self->ioas_id);
- /* Enable 1M mock IOMMU hugepages */
- if (variant->hugepages) {
- test_cmd_mock_domain_flags(self->ioas_id,
- MOCK_FLAGS_DEVICE_HUGE_IOVA,
- &self->stdev_id, &self->hwpt_id,
- &self->idev_id);
- } else {
- test_cmd_mock_domain(self->ioas_id, &self->stdev_id,
- &self->hwpt_id, &self->idev_id);
- }
+
+ /*
+ * For dirty testing it is important that the page size fed into
+ * the iommu page tables matches the size the dirty logic
+ * expects, or set_dirty can touch too much stuff.
+ */
+ cmd.object_id = self->ioas_id;
+ if (!variant->hugepages)
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id,
+ &self->idev_id);
}
FIXTURE_TEARDOWN(iommufd_dirty_tracking)
@@ -2248,18 +2296,23 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability)
TEST_F(iommufd_dirty_tracking, get_dirty_bitmap)
{
uint32_t page_size = MOCK_PAGE_SIZE;
+ uint32_t ioas_id = self->ioas_id;
uint32_t hwpt_id;
- uint32_t ioas_id;
if (variant->hugepages)
page_size = MOCK_HUGE_PAGE_SIZE;
- test_ioctl_ioas_alloc(&ioas_id);
test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
variant->buffer_size, MOCK_APERTURE_START);
- test_cmd_hwpt_alloc(self->idev_id, ioas_id,
- IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+ if (variant->hugepages)
+ test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+ MOCK_IOMMUPT_HUGE, &hwpt_id);
+ else
+ test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+ MOCK_IOMMUPT_DEFAULT, &hwpt_id);
test_cmd_set_dirty_tracking(hwpt_id, true);
@@ -2285,18 +2338,24 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap)
TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear)
{
uint32_t page_size = MOCK_PAGE_SIZE;
+ uint32_t ioas_id = self->ioas_id;
uint32_t hwpt_id;
- uint32_t ioas_id;
if (variant->hugepages)
page_size = MOCK_HUGE_PAGE_SIZE;
- test_ioctl_ioas_alloc(&ioas_id);
test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
variant->buffer_size, MOCK_APERTURE_START);
- test_cmd_hwpt_alloc(self->idev_id, ioas_id,
- IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+
+ if (variant->hugepages)
+ test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+ MOCK_IOMMUPT_HUGE, &hwpt_id);
+ else
+ test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+ MOCK_IOMMUPT_DEFAULT, &hwpt_id);
test_cmd_set_dirty_tracking(hwpt_id, true);
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 9f472c20c190..0a0ff6f7926d 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -215,6 +215,18 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i
ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \
hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \
0))
+#define test_cmd_hwpt_alloc_iommupt(device_id, pt_id, flags, iommupt_type, \
+ hwpt_id) \
+ ({ \
+ struct iommu_hwpt_selftest user_cfg = { \
+ .pagetable_type = iommupt_type \
+ }; \
+ \
+ ASSERT_EQ(0, _test_cmd_hwpt_alloc( \
+ self->fd, device_id, pt_id, 0, flags, \
+ hwpt_id, IOMMU_HWPT_DATA_SELFTEST, \
+ &user_cfg, sizeof(user_cfg))); \
+ })
#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \
EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \
self->fd, device_id, pt_id, 0, flags, \
@@ -548,6 +560,39 @@ static int _test_cmd_destroy_access_pages(int fd, unsigned int access_id,
EXPECT_ERRNO(_errno, _test_cmd_destroy_access_pages( \
self->fd, access_id, access_pages_id))
+static int _test_cmd_get_dmabuf(int fd, size_t len, int *out_fd)
+{
+ struct iommu_test_cmd cmd = {
+ .size = sizeof(cmd),
+ .op = IOMMU_TEST_OP_DMABUF_GET,
+ .dmabuf_get = { .length = len, .open_flags = O_CLOEXEC },
+ };
+
+ *out_fd = ioctl(fd, IOMMU_TEST_CMD, &cmd);
+ if (*out_fd < 0)
+ return -1;
+ return 0;
+}
+#define test_cmd_get_dmabuf(len, out_fd) \
+ ASSERT_EQ(0, _test_cmd_get_dmabuf(self->fd, len, out_fd))
+
+static int _test_cmd_revoke_dmabuf(int fd, int dmabuf_fd, bool revoked)
+{
+ struct iommu_test_cmd cmd = {
+ .size = sizeof(cmd),
+ .op = IOMMU_TEST_OP_DMABUF_REVOKE,
+ .dmabuf_revoke = { .dmabuf_fd = dmabuf_fd, .revoked = revoked },
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_TEST_CMD, &cmd);
+ if (ret < 0)
+ return -1;
+ return 0;
+}
+#define test_cmd_revoke_dmabuf(dmabuf_fd, revoke) \
+ ASSERT_EQ(0, _test_cmd_revoke_dmabuf(self->fd, dmabuf_fd, revoke))
+
static int _test_ioctl_destroy(int fd, unsigned int id)
{
struct iommu_destroy cmd = {
@@ -718,6 +763,17 @@ static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd,
self->fd, ioas_id, mfd, start, length, iova_p, \
IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+#define test_ioctl_ioas_map_fixed_file(mfd, start, length, iova) \
+ ({ \
+ __u64 __iova = iova; \
+ ASSERT_EQ(0, _test_ioctl_ioas_map_file( \
+ self->fd, self->ioas_id, mfd, start, \
+ length, &__iova, \
+ IOMMU_IOAS_MAP_FIXED_IOVA | \
+ IOMMU_IOAS_MAP_WRITEABLE | \
+ IOMMU_IOAS_MAP_READABLE)); \
+ })
+
static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit)
{
struct iommu_test_cmd memlimit_cmd = {
diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile
index 324ba0175a33..3c796ca99a50 100644
--- a/tools/testing/selftests/vfio/Makefile
+++ b/tools/testing/selftests/vfio/Makefile
@@ -2,8 +2,14 @@ CFLAGS = $(KHDR_INCLUDES)
TEST_GEN_PROGS += vfio_dma_mapping_test
TEST_GEN_PROGS += vfio_iommufd_setup_test
TEST_GEN_PROGS += vfio_pci_device_test
+TEST_GEN_PROGS += vfio_pci_device_init_perf_test
TEST_GEN_PROGS += vfio_pci_driver_test
-TEST_PROGS_EXTENDED := run.sh
+
+TEST_FILES += scripts/cleanup.sh
+TEST_FILES += scripts/lib.sh
+TEST_FILES += scripts/run.sh
+TEST_FILES += scripts/setup.sh
+
include ../lib.mk
include lib/libvfio.mk
@@ -11,6 +17,8 @@ CFLAGS += -I$(top_srcdir)/tools/include
CFLAGS += -MD
CFLAGS += $(EXTRA_CFLAGS)
+LDFLAGS += -pthread
+
$(TEST_GEN_PROGS): %: %.o $(LIBVFIO_O)
$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LIBVFIO_O) $(LDLIBS) -o $@
diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c
index 0ca2cbc2a316..c75045bcab79 100644
--- a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c
+++ b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c
@@ -9,7 +9,7 @@
#include <linux/pci_ids.h>
#include <linux/sizes.h>
-#include <vfio_util.h>
+#include <libvfio.h>
#include "registers.h"
@@ -70,7 +70,7 @@ static int dsa_probe(struct vfio_pci_device *device)
return -EINVAL;
if (dsa_int_handle_request_required(device)) {
- printf("Device requires requesting interrupt handles\n");
+ dev_err(device, "Device requires requesting interrupt handles\n");
return -EINVAL;
}
@@ -91,23 +91,23 @@ static void dsa_check_sw_err(struct vfio_pci_device *device)
return;
}
- fprintf(stderr, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n",
+ dev_err(device, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n",
err.bits[0], err.bits[1], err.bits[2], err.bits[3]);
- fprintf(stderr, " valid: 0x%x\n", err.valid);
- fprintf(stderr, " overflow: 0x%x\n", err.overflow);
- fprintf(stderr, " desc_valid: 0x%x\n", err.desc_valid);
- fprintf(stderr, " wq_idx_valid: 0x%x\n", err.wq_idx_valid);
- fprintf(stderr, " batch: 0x%x\n", err.batch);
- fprintf(stderr, " fault_rw: 0x%x\n", err.fault_rw);
- fprintf(stderr, " priv: 0x%x\n", err.priv);
- fprintf(stderr, " error: 0x%x\n", err.error);
- fprintf(stderr, " wq_idx: 0x%x\n", err.wq_idx);
- fprintf(stderr, " operation: 0x%x\n", err.operation);
- fprintf(stderr, " pasid: 0x%x\n", err.pasid);
- fprintf(stderr, " batch_idx: 0x%x\n", err.batch_idx);
- fprintf(stderr, " invalid_flags: 0x%x\n", err.invalid_flags);
- fprintf(stderr, " fault_addr: 0x%lx\n", err.fault_addr);
+ dev_err(device, " valid: 0x%x\n", err.valid);
+ dev_err(device, " overflow: 0x%x\n", err.overflow);
+ dev_err(device, " desc_valid: 0x%x\n", err.desc_valid);
+ dev_err(device, " wq_idx_valid: 0x%x\n", err.wq_idx_valid);
+ dev_err(device, " batch: 0x%x\n", err.batch);
+ dev_err(device, " fault_rw: 0x%x\n", err.fault_rw);
+ dev_err(device, " priv: 0x%x\n", err.priv);
+ dev_err(device, " error: 0x%x\n", err.error);
+ dev_err(device, " wq_idx: 0x%x\n", err.wq_idx);
+ dev_err(device, " operation: 0x%x\n", err.operation);
+ dev_err(device, " pasid: 0x%x\n", err.pasid);
+ dev_err(device, " batch_idx: 0x%x\n", err.batch_idx);
+ dev_err(device, " invalid_flags: 0x%x\n", err.invalid_flags);
+ dev_err(device, " fault_addr: 0x%lx\n", err.fault_addr);
VFIO_FAIL("Software Error Detected!\n");
}
@@ -256,7 +256,7 @@ static int dsa_completion_wait(struct vfio_pci_device *device,
if (status == DSA_COMP_SUCCESS)
return 0;
- printf("Error detected during memcpy operation: 0x%x\n", status);
+ dev_err(device, "Error detected during memcpy operation: 0x%x\n", status);
return -1;
}
diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c
index c3b91d9b1f59..a871b935542b 100644
--- a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c
+++ b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c
@@ -7,7 +7,7 @@
#include <linux/pci_ids.h>
#include <linux/sizes.h>
-#include <vfio_util.h>
+#include <libvfio.h>
#include "hw.h"
#include "registers.h"
@@ -51,7 +51,7 @@ static int ioat_probe(struct vfio_pci_device *device)
r = 0;
break;
default:
- printf("ioat: Unsupported version: 0x%x\n", version);
+ dev_err(device, "ioat: Unsupported version: 0x%x\n", version);
r = -EINVAL;
}
return r;
@@ -135,13 +135,13 @@ static void ioat_handle_error(struct vfio_pci_device *device)
{
void *registers = ioat_channel_registers(device);
- printf("Error detected during memcpy operation!\n"
- " CHANERR: 0x%x\n"
- " CHANERR_INT: 0x%x\n"
- " DMAUNCERRSTS: 0x%x\n",
- readl(registers + IOAT_CHANERR_OFFSET),
- vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET),
- vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET));
+ dev_err(device, "Error detected during memcpy operation!\n"
+ " CHANERR: 0x%x\n"
+ " CHANERR_INT: 0x%x\n"
+ " DMAUNCERRSTS: 0x%x\n",
+ readl(registers + IOAT_CHANERR_OFFSET),
+ vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET),
+ vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET));
ioat_reset(device);
}
diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h
new file mode 100644
index 000000000000..279ddcd70194
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/include/libvfio.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H
+#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H
+
+#include <libvfio/assert.h>
+#include <libvfio/iommu.h>
+#include <libvfio/iova_allocator.h>
+#include <libvfio/vfio_pci_device.h>
+#include <libvfio/vfio_pci_driver.h>
+
+/*
+ * Return the BDF string of the device that the test should use.
+ *
+ * If a BDF string is provided by the user on the command line (as the last
+ * element of argv[]), then this function will return that and decrement argc
+ * by 1.
+ *
+ * Otherwise this function will attempt to use the environment variable
+ * $VFIO_SELFTESTS_BDF.
+ *
+ * If a BDF cannot be determined then the test will exit with KSFT_SKIP.
+ *
+ * vfio_selftests_get_bdfs() behaves the same way, but accepts any number of
+ * trailing BDF arguments, returning them as an array and storing the count
+ * in @nr_bdfs.
+ */
+const char *vfio_selftests_get_bdf(int *argc, char *argv[]);
+char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs);
+
+#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */
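For orientation, a minimal sketch of a test entry point built on this header. The setup/teardown calls are the ones declared in this patch; the test body is omitted, and the cleanup ordering is an assumption based on the ownership model introduced below.

    #include <libvfio.h>

    int main(int argc, char *argv[])
    {
        const char *bdf = vfio_selftests_get_bdf(&argc, argv);
        struct iommu *iommu = iommu_init(default_iommu_mode);
        struct vfio_pci_device *device = vfio_pci_device_init(bdf, iommu);

        /* ... exercise the device ... */

        vfio_pci_device_cleanup(device);
        iommu_cleanup(iommu);
        return 0;
    }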
diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/assert.h b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h
new file mode 100644
index 000000000000..f4ebd122d9b6
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H
+#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "../../../../kselftest.h"
+
+#define VFIO_LOG_AND_EXIT(...) do { \
+ fprintf(stderr, " " __VA_ARGS__); \
+ fprintf(stderr, "\n"); \
+ exit(KSFT_FAIL); \
+} while (0)
+
+#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) do { \
+ typeof(_lhs) __lhs = (_lhs); \
+ typeof(_rhs) __rhs = (_rhs); \
+ \
+ if (__lhs _op __rhs) \
+ break; \
+ \
+ fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \
+ fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \
+ fprintf(stderr, " Observed: %#lx %s %#lx\n", \
+ (u64)__lhs, #_op, (u64)__rhs); \
+ fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \
+ VFIO_LOG_AND_EXIT(__VA_ARGS__); \
+} while (0)
+
+#define VFIO_ASSERT_EQ(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__)
+#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__)
+#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__)
+#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__)
+#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__)
+#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__)
+#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__)
+#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__)
+#define VFIO_ASSERT_NULL(_a, ...) VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__)
+#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__)
+
+#define VFIO_FAIL(_fmt, ...) do { \
+ fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \
+ VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define ioctl_assert(_fd, _op, _arg) do { \
+ void *__arg = (_arg); \
+ int __ret = ioctl((_fd), (_op), (__arg)); \
+ VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \
+} while (0)
+
+#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H */
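A quick sketch of these macros at a call site (the values are illustrative). A failing assert prints the file:line header, the stringified expression, both observed values, errno, and the optional message, then exits with KSFT_FAIL.

    #include <libvfio/assert.h>

    int main(void)
    {
        int fd = -1;

        VFIO_ASSERT_EQ(2 + 2, 4);                        /* passes, no output */
        VFIO_ASSERT_GE(fd, 0, "no device fd: %d\n", fd); /* fails, exits */
        return 0;
    }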
diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
new file mode 100644
index 000000000000..5c9b9dc6d993
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H
+#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+#include <libvfio/assert.h>
+
+typedef u64 iova_t;
+
+struct iommu_mode {
+ const char *name;
+ const char *container_path;
+ unsigned long iommu_type;
+};
+
+extern const char *default_iommu_mode;
+
+struct dma_region {
+ struct list_head link;
+ void *vaddr;
+ iova_t iova;
+ u64 size;
+};
+
+struct iommu {
+ const struct iommu_mode *mode;
+ int container_fd;
+ int iommufd;
+ u32 ioas_id;
+ struct list_head dma_regions;
+};
+
+struct iommu *iommu_init(const char *iommu_mode);
+void iommu_cleanup(struct iommu *iommu);
+
+int __iommu_map(struct iommu *iommu, struct dma_region *region);
+
+static inline void iommu_map(struct iommu *iommu, struct dma_region *region)
+{
+ VFIO_ASSERT_EQ(__iommu_map(iommu, region), 0);
+}
+
+int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped);
+
+static inline void iommu_unmap(struct iommu *iommu, struct dma_region *region)
+{
+ VFIO_ASSERT_EQ(__iommu_unmap(iommu, region, NULL), 0);
+}
+
+int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped);
+
+static inline void iommu_unmap_all(struct iommu *iommu)
+{
+ VFIO_ASSERT_EQ(__iommu_unmap_all(iommu, NULL), 0);
+}
+
+int __iommu_hva2iova(struct iommu *iommu, void *vaddr, iova_t *iova);
+iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr);
+
+struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges);
+
+/*
+ * Generator for VFIO selftests fixture variants that replicate across all
+ * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE()
+ * which should then use FIXTURE_VARIANT_ADD() to create the variant.
+ */
+#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) \
+FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \
+FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \
+FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \
+FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \
+FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__)
+
+#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H */
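A sketch of the map/unmap flow this header exposes. SZ_2M/SZ_1G come from linux/sizes.h, and the fixed IOVA is an arbitrary assumption; a real test would take IOVAs from the allocator declared in the next header.

    #include <sys/mman.h>
    #include <linux/sizes.h>
    #include <libvfio.h>

    static void map_example(struct iommu *iommu)
    {
        struct dma_region region = {
            .vaddr = mmap(NULL, SZ_2M, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
            .iova = SZ_1G,      /* arbitrary; may not be a legal IOVA */
            .size = SZ_2M,
        };

        VFIO_ASSERT_NE(region.vaddr, MAP_FAILED);

        iommu_map(iommu, &region);      /* asserts on failure */
        VFIO_ASSERT_EQ(iommu_hva2iova(iommu, region.vaddr), region.iova);
        iommu_unmap(iommu, &region);
    }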
diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h b/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h
new file mode 100644
index 000000000000..8f1d994e9ea2
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H
+#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H
+
+#include <uapi/linux/types.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/iommufd.h>
+
+#include <libvfio/iommu.h>
+
+struct iova_allocator {
+ struct iommu_iova_range *ranges;
+ u32 nranges;
+ u32 range_idx;
+ u64 range_offset;
+};
+
+struct iova_allocator *iova_allocator_init(struct iommu *iommu);
+void iova_allocator_cleanup(struct iova_allocator *allocator);
+iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size);
+
+#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H */
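And a short sketch of the allocator in use; the iommu handle comes from iommu_init() and the SZ_* constants from linux/sizes.h. Sizes must be non-zero powers of two, and the returned IOVAs are size-aligned.

    struct iova_allocator *allocator = iova_allocator_init(iommu);

    iova_t page  = iova_allocator_alloc(allocator, SZ_4K);
    iova_t chunk = iova_allocator_alloc(allocator, SZ_2M); /* 2M-aligned */

    iova_allocator_cleanup(allocator);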
diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h
new file mode 100644
index 000000000000..2858885a89bb
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H
+#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H
+
+#include <fcntl.h>
+#include <linux/vfio.h>
+#include <linux/pci_regs.h>
+
+#include <libvfio/assert.h>
+#include <libvfio/iommu.h>
+#include <libvfio/vfio_pci_driver.h>
+
+struct vfio_pci_bar {
+ struct vfio_region_info info;
+ void *vaddr;
+};
+
+struct vfio_pci_device {
+ const char *bdf;
+ int fd;
+ int group_fd;
+
+ struct iommu *iommu;
+
+ struct vfio_device_info info;
+ struct vfio_region_info config_space;
+ struct vfio_pci_bar bars[PCI_STD_NUM_BARS];
+
+ struct vfio_irq_info msi_info;
+ struct vfio_irq_info msix_info;
+
+	/* eventfds for MSI and MSI-X interrupts */
+ int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1];
+
+ struct vfio_pci_driver driver;
+};
+
+#define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__)
+#define dev_err(_dev, _fmt, ...) fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__)
+
+struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu);
+void vfio_pci_device_cleanup(struct vfio_pci_device *device);
+
+void vfio_pci_device_reset(struct vfio_pci_device *device);
+
+void vfio_pci_config_access(struct vfio_pci_device *device, bool write,
+ size_t config, size_t size, void *data);
+
+#define vfio_pci_config_read(_device, _offset, _type) ({ \
+ _type __data; \
+ vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \
+ __data; \
+})
+
+#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8)
+#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16)
+#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32)
+
+#define vfio_pci_config_write(_device, _offset, _value, _type) do { \
+ _type __data = (_value); \
+ vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \
+} while (0)
+
+#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8)
+#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16)
+#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32)
+
+void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index,
+ u32 vector, int count);
+void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index);
+void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector);
+
+static inline void fcntl_set_nonblock(int fd)
+{
+ int r;
+
+ r = fcntl(fd, F_GETFL, 0);
+ VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd);
+
+ r = fcntl(fd, F_SETFL, r | O_NONBLOCK);
+ VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd);
+}
+
+static inline void vfio_pci_msi_enable(struct vfio_pci_device *device,
+ u32 vector, int count)
+{
+ vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count);
+}
+
+static inline void vfio_pci_msi_disable(struct vfio_pci_device *device)
+{
+ vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX);
+}
+
+static inline void vfio_pci_msix_enable(struct vfio_pci_device *device,
+ u32 vector, int count)
+{
+ vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count);
+}
+
+static inline void vfio_pci_msix_disable(struct vfio_pci_device *device)
+{
+ vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX);
+}
+
+static inline int __to_iova(struct vfio_pci_device *device, void *vaddr, iova_t *iova)
+{
+ return __iommu_hva2iova(device->iommu, vaddr, iova);
+}
+
+static inline iova_t to_iova(struct vfio_pci_device *device, void *vaddr)
+{
+ return iommu_hva2iova(device->iommu, vaddr);
+}
+
+static inline bool vfio_pci_device_match(struct vfio_pci_device *device,
+ u16 vendor_id, u16 device_id)
+{
+ return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) &&
+ (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID));
+}
+
+const char *vfio_pci_get_cdev_path(const char *bdf);
+
+#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H */
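A sketch of the config-space and interrupt helpers above in use. The vendor/device IDs are placeholders, not values this patch defines.

    u16 vendor = vfio_pci_config_readw(device, PCI_VENDOR_ID);
    u16 devid = vfio_pci_config_readw(device, PCI_DEVICE_ID);

    dev_info(device, "device: %04x:%04x\n", vendor, devid);

    if (vfio_pci_device_match(device, 0x8086, 0x0b25)) /* placeholder IDs */
        dev_info(device, "supported device found\n");

    vfio_pci_msi_enable(device, 0, 1);   /* vector 0, one interrupt */
    vfio_pci_msi_disable(device);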
diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h
new file mode 100644
index 000000000000..e5ada209b1d1
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H
+#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H
+
+#include <libvfio/iommu.h>
+
+struct vfio_pci_device;
+
+struct vfio_pci_driver_ops {
+ const char *name;
+
+ /**
+ * @probe() - Check if the driver supports the given device.
+ *
+ * Return: 0 on success, non-0 on failure.
+ */
+ int (*probe)(struct vfio_pci_device *device);
+
+ /**
+ * @init() - Initialize the driver for @device.
+ *
+ * Must be called after device->driver.region has been initialized.
+ */
+ void (*init)(struct vfio_pci_device *device);
+
+ /**
+	 * @remove() - Deinitialize the driver for @device.
+ */
+ void (*remove)(struct vfio_pci_device *device);
+
+ /**
+ * memcpy_start() - Kick off @count repeated memcpy operations from
+ * [@src, @src + @size) to [@dst, @dst + @size).
+ *
+ * Guarantees:
+ * - The device will attempt DMA reads on [src, src + size).
+ * - The device will attempt DMA writes on [dst, dst + size).
+ * - The device will not generate any interrupts.
+ *
+ * memcpy_start() returns immediately, it does not wait for the
+ * copies to complete.
+ */
+ void (*memcpy_start)(struct vfio_pci_device *device,
+ iova_t src, iova_t dst, u64 size, u64 count);
+
+ /**
+ * memcpy_wait() - Wait until the memcpy operations started by
+ * memcpy_start() have finished.
+ *
+ * Guarantees:
+ * - All in-flight DMAs initiated by memcpy_start() are fully complete
+ * before memcpy_wait() returns.
+ *
+ * Returns non-0 if the driver detects that an error occurred during the
+ * memcpy, 0 otherwise.
+ */
+ int (*memcpy_wait)(struct vfio_pci_device *device);
+
+ /**
+	 * @send_msi() - Make the device send the MSI vector stored in
+	 * device->driver.msi.
+ *
+ * Guarantees:
+ * - The device will send the MSI once.
+ */
+ void (*send_msi)(struct vfio_pci_device *device);
+};
+
+struct vfio_pci_driver {
+ const struct vfio_pci_driver_ops *ops;
+ bool initialized;
+ bool memcpy_in_progress;
+
+ /* Region to be used by the driver (e.g. for in-memory descriptors) */
+ struct dma_region region;
+
+ /* The maximum size that can be passed to memcpy_start(). */
+ u64 max_memcpy_size;
+
+ /* The maximum count that can be passed to memcpy_start(). */
+ u64 max_memcpy_count;
+
+ /* The MSI vector the device will signal in ops->send_msi(). */
+ int msi;
+};
+
+void vfio_pci_driver_probe(struct vfio_pci_device *device);
+void vfio_pci_driver_init(struct vfio_pci_device *device);
+void vfio_pci_driver_remove(struct vfio_pci_device *device);
+int vfio_pci_driver_memcpy(struct vfio_pci_device *device,
+ iova_t src, iova_t dst, u64 size);
+void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device,
+ iova_t src, iova_t dst, u64 size,
+ u64 count);
+int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device);
+void vfio_pci_driver_send_msi(struct vfio_pci_device *device);
+
+#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H */
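The intended call sequence, as a sketch: probe selects the ops by device, init() requires device->driver.region to be mapped first, and src/dst below are assumed to be IOVAs backed by memory already mapped for the device.

    vfio_pci_driver_probe(device);
    vfio_pci_driver_init(device);

    /* One-shot copy: start + wait, bounded by driver.max_memcpy_size. */
    VFIO_ASSERT_EQ(vfio_pci_driver_memcpy(device, src, dst, SZ_4K), 0);

    /* Split form: the DMA runs while the CPU does other work. */
    vfio_pci_driver_memcpy_start(device, src, dst, SZ_4K, 8);
    VFIO_ASSERT_EQ(vfio_pci_driver_memcpy_wait(device), 0);

    vfio_pci_driver_remove(device);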
diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h
deleted file mode 100644
index 69ec0c856481..000000000000
--- a/tools/testing/selftests/vfio/lib/include/vfio_util.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H
-#define SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H
-
-#include <fcntl.h>
-#include <string.h>
-
-#include <uapi/linux/types.h>
-#include <linux/iommufd.h>
-#include <linux/list.h>
-#include <linux/pci_regs.h>
-#include <linux/vfio.h>
-
-#include "../../../kselftest.h"
-
-#define VFIO_LOG_AND_EXIT(...) do { \
- fprintf(stderr, " " __VA_ARGS__); \
- fprintf(stderr, "\n"); \
- exit(KSFT_FAIL); \
-} while (0)
-
-#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) do { \
- typeof(_lhs) __lhs = (_lhs); \
- typeof(_rhs) __rhs = (_rhs); \
- \
- if (__lhs _op __rhs) \
- break; \
- \
- fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \
- fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \
- fprintf(stderr, " Observed: %#lx %s %#lx\n", \
- (u64)__lhs, #_op, (u64)__rhs); \
- fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \
- VFIO_LOG_AND_EXIT(__VA_ARGS__); \
-} while (0)
-
-#define VFIO_ASSERT_EQ(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__)
-#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__)
-#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__)
-#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__)
-#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__)
-#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__)
-#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__)
-#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__)
-#define VFIO_ASSERT_NULL(_a, ...) VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__)
-#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__)
-
-#define VFIO_FAIL(_fmt, ...) do { \
- fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \
- VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \
-} while (0)
-
-struct vfio_iommu_mode {
- const char *name;
- const char *container_path;
- unsigned long iommu_type;
-};
-
-/*
- * Generator for VFIO selftests fixture variants that replicate across all
- * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE()
- * which should then use FIXTURE_VARIANT_ADD() to create the variant.
- */
-#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) \
-FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \
-FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \
-FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \
-FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \
-FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__)
-
-struct vfio_pci_bar {
- struct vfio_region_info info;
- void *vaddr;
-};
-
-typedef u64 iova_t;
-
-#define INVALID_IOVA UINT64_MAX
-
-struct vfio_dma_region {
- struct list_head link;
- void *vaddr;
- iova_t iova;
- u64 size;
-};
-
-struct vfio_pci_device;
-
-struct vfio_pci_driver_ops {
- const char *name;
-
- /**
- * @probe() - Check if the driver supports the given device.
- *
- * Return: 0 on success, non-0 on failure.
- */
- int (*probe)(struct vfio_pci_device *device);
-
- /**
- * @init() - Initialize the driver for @device.
- *
- * Must be called after device->driver.region has been initialized.
- */
- void (*init)(struct vfio_pci_device *device);
-
- /**
- * remove() - Deinitialize the driver for @device.
- */
- void (*remove)(struct vfio_pci_device *device);
-
- /**
- * memcpy_start() - Kick off @count repeated memcpy operations from
- * [@src, @src + @size) to [@dst, @dst + @size).
- *
- * Guarantees:
- * - The device will attempt DMA reads on [src, src + size).
- * - The device will attempt DMA writes on [dst, dst + size).
- * - The device will not generate any interrupts.
- *
- * memcpy_start() returns immediately, it does not wait for the
- * copies to complete.
- */
- void (*memcpy_start)(struct vfio_pci_device *device,
- iova_t src, iova_t dst, u64 size, u64 count);
-
- /**
- * memcpy_wait() - Wait until the memcpy operations started by
- * memcpy_start() have finished.
- *
- * Guarantees:
- * - All in-flight DMAs initiated by memcpy_start() are fully complete
- * before memcpy_wait() returns.
- *
- * Returns non-0 if the driver detects that an error occurred during the
- * memcpy, 0 otherwise.
- */
- int (*memcpy_wait)(struct vfio_pci_device *device);
-
- /**
- * send_msi() - Make the device send the MSI device->driver.msi.
- *
- * Guarantees:
- * - The device will send the MSI once.
- */
- void (*send_msi)(struct vfio_pci_device *device);
-};
-
-struct vfio_pci_driver {
- const struct vfio_pci_driver_ops *ops;
- bool initialized;
- bool memcpy_in_progress;
-
- /* Region to be used by the driver (e.g. for in-memory descriptors) */
- struct vfio_dma_region region;
-
- /* The maximum size that can be passed to memcpy_start(). */
- u64 max_memcpy_size;
-
- /* The maximum count that can be passed to memcpy_start(). */
- u64 max_memcpy_count;
-
- /* The MSI vector the device will signal in ops->send_msi(). */
- int msi;
-};
-
-struct vfio_pci_device {
- int fd;
-
- const struct vfio_iommu_mode *iommu_mode;
- int group_fd;
- int container_fd;
-
- int iommufd;
- u32 ioas_id;
-
- struct vfio_device_info info;
- struct vfio_region_info config_space;
- struct vfio_pci_bar bars[PCI_STD_NUM_BARS];
-
- struct vfio_irq_info msi_info;
- struct vfio_irq_info msix_info;
-
- struct list_head dma_regions;
-
- /* eventfds for MSI and MSI-x interrupts */
- int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1];
-
- struct vfio_pci_driver driver;
-};
-
-struct iova_allocator {
- struct iommu_iova_range *ranges;
- u32 nranges;
- u32 range_idx;
- u64 range_offset;
-};
-
-/*
- * Return the BDF string of the device that the test should use.
- *
- * If a BDF string is provided by the user on the command line (as the last
- * element of argv[]), then this function will return that and decrement argc
- * by 1.
- *
- * Otherwise this function will attempt to use the environment variable
- * $VFIO_SELFTESTS_BDF.
- *
- * If BDF cannot be determined then the test will exit with KSFT_SKIP.
- */
-const char *vfio_selftests_get_bdf(int *argc, char *argv[]);
-const char *vfio_pci_get_cdev_path(const char *bdf);
-
-extern const char *default_iommu_mode;
-
-struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode);
-void vfio_pci_device_cleanup(struct vfio_pci_device *device);
-void vfio_pci_device_reset(struct vfio_pci_device *device);
-
-struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device,
- u32 *nranges);
-
-struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device);
-void iova_allocator_cleanup(struct iova_allocator *allocator);
-iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size);
-
-int __vfio_pci_dma_map(struct vfio_pci_device *device,
- struct vfio_dma_region *region);
-int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
- struct vfio_dma_region *region,
- u64 *unmapped);
-int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped);
-
-static inline void vfio_pci_dma_map(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
-{
- VFIO_ASSERT_EQ(__vfio_pci_dma_map(device, region), 0);
-}
-
-static inline void vfio_pci_dma_unmap(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
-{
- VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0);
-}
-
-static inline void vfio_pci_dma_unmap_all(struct vfio_pci_device *device)
-{
- VFIO_ASSERT_EQ(__vfio_pci_dma_unmap_all(device, NULL), 0);
-}
-
-void vfio_pci_config_access(struct vfio_pci_device *device, bool write,
- size_t config, size_t size, void *data);
-
-#define vfio_pci_config_read(_device, _offset, _type) ({ \
- _type __data; \
- vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \
- __data; \
-})
-
-#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8)
-#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16)
-#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32)
-
-#define vfio_pci_config_write(_device, _offset, _value, _type) do { \
- _type __data = (_value); \
- vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \
-} while (0)
-
-#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8)
-#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16)
-#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32)
-
-void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index,
- u32 vector, int count);
-void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index);
-void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector);
-
-static inline void fcntl_set_nonblock(int fd)
-{
- int r;
-
- r = fcntl(fd, F_GETFL, 0);
- VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd);
-
- r = fcntl(fd, F_SETFL, r | O_NONBLOCK);
- VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd);
-}
-
-static inline void vfio_pci_msi_enable(struct vfio_pci_device *device,
- u32 vector, int count)
-{
- vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count);
-}
-
-static inline void vfio_pci_msi_disable(struct vfio_pci_device *device)
-{
- vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX);
-}
-
-static inline void vfio_pci_msix_enable(struct vfio_pci_device *device,
- u32 vector, int count)
-{
- vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count);
-}
-
-static inline void vfio_pci_msix_disable(struct vfio_pci_device *device)
-{
- vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX);
-}
-
-iova_t __to_iova(struct vfio_pci_device *device, void *vaddr);
-iova_t to_iova(struct vfio_pci_device *device, void *vaddr);
-
-static inline bool vfio_pci_device_match(struct vfio_pci_device *device,
- u16 vendor_id, u16 device_id)
-{
- return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) &&
- (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID));
-}
-
-void vfio_pci_driver_probe(struct vfio_pci_device *device);
-void vfio_pci_driver_init(struct vfio_pci_device *device);
-void vfio_pci_driver_remove(struct vfio_pci_device *device);
-int vfio_pci_driver_memcpy(struct vfio_pci_device *device,
- iova_t src, iova_t dst, u64 size);
-void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device,
- iova_t src, iova_t dst, u64 size,
- u64 count);
-int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device);
-void vfio_pci_driver_send_msi(struct vfio_pci_device *device);
-
-#endif /* SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H */
diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c
new file mode 100644
index 000000000000..8079d43523f3
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/iommu.c
@@ -0,0 +1,465 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <dirent.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <uapi/linux/types.h>
+#include <linux/limits.h>
+#include <linux/mman.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/iommufd.h>
+
+#include "../../../kselftest.h"
+#include <libvfio.h>
+
+const char *default_iommu_mode = "iommufd";
+
+/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */
+static const struct iommu_mode iommu_modes[] = {
+ {
+ .name = "vfio_type1_iommu",
+ .container_path = "/dev/vfio/vfio",
+ .iommu_type = VFIO_TYPE1_IOMMU,
+ },
+ {
+ .name = "vfio_type1v2_iommu",
+ .container_path = "/dev/vfio/vfio",
+ .iommu_type = VFIO_TYPE1v2_IOMMU,
+ },
+ {
+ .name = "iommufd_compat_type1",
+ .container_path = "/dev/iommu",
+ .iommu_type = VFIO_TYPE1_IOMMU,
+ },
+ {
+ .name = "iommufd_compat_type1v2",
+ .container_path = "/dev/iommu",
+ .iommu_type = VFIO_TYPE1v2_IOMMU,
+ },
+ {
+ .name = "iommufd",
+ },
+};
+
+static const struct iommu_mode *lookup_iommu_mode(const char *iommu_mode)
+{
+ int i;
+
+ if (!iommu_mode)
+ iommu_mode = default_iommu_mode;
+
+ for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) {
+ if (strcmp(iommu_mode, iommu_modes[i].name))
+ continue;
+
+ return &iommu_modes[i];
+ }
+
+ VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode);
+}
+
+int __iommu_hva2iova(struct iommu *iommu, void *vaddr, iova_t *iova)
+{
+ struct dma_region *region;
+
+ list_for_each_entry(region, &iommu->dma_regions, link) {
+ if (vaddr < region->vaddr)
+ continue;
+
+ if (vaddr >= region->vaddr + region->size)
+ continue;
+
+ if (iova)
+ *iova = region->iova + (vaddr - region->vaddr);
+
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr)
+{
+ iova_t iova;
+ int ret;
+
+ ret = __iommu_hva2iova(iommu, vaddr, &iova);
+ VFIO_ASSERT_EQ(ret, 0, "%p is not mapped into the iommu\n", vaddr);
+
+ return iova;
+}
+
+static int vfio_iommu_map(struct iommu *iommu, struct dma_region *region)
+{
+ struct vfio_iommu_type1_dma_map args = {
+ .argsz = sizeof(args),
+ .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ .vaddr = (u64)region->vaddr,
+ .iova = region->iova,
+ .size = region->size,
+ };
+
+ if (ioctl(iommu->container_fd, VFIO_IOMMU_MAP_DMA, &args))
+ return -errno;
+
+ return 0;
+}
+
+static int iommufd_map(struct iommu *iommu, struct dma_region *region)
+{
+ struct iommu_ioas_map args = {
+ .size = sizeof(args),
+ .flags = IOMMU_IOAS_MAP_READABLE |
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_FIXED_IOVA,
+ .user_va = (u64)region->vaddr,
+ .iova = region->iova,
+ .length = region->size,
+ .ioas_id = iommu->ioas_id,
+ };
+
+ if (ioctl(iommu->iommufd, IOMMU_IOAS_MAP, &args))
+ return -errno;
+
+ return 0;
+}
+
+int __iommu_map(struct iommu *iommu, struct dma_region *region)
+{
+ int ret;
+
+ if (iommu->iommufd)
+ ret = iommufd_map(iommu, region);
+ else
+ ret = vfio_iommu_map(iommu, region);
+
+ if (ret)
+ return ret;
+
+ list_add(&region->link, &iommu->dma_regions);
+
+ return 0;
+}
+
+static int __vfio_iommu_unmap(int fd, u64 iova, u64 size, u32 flags, u64 *unmapped)
+{
+ struct vfio_iommu_type1_dma_unmap args = {
+ .argsz = sizeof(args),
+ .iova = iova,
+ .size = size,
+ .flags = flags,
+ };
+
+ if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args))
+ return -errno;
+
+ if (unmapped)
+ *unmapped = args.size;
+
+ return 0;
+}
+
+static int vfio_iommu_unmap(struct iommu *iommu, struct dma_region *region,
+ u64 *unmapped)
+{
+ return __vfio_iommu_unmap(iommu->container_fd, region->iova,
+ region->size, 0, unmapped);
+}
+
+static int __iommufd_unmap(int fd, u64 iova, u64 length, u32 ioas_id, u64 *unmapped)
+{
+ struct iommu_ioas_unmap args = {
+ .size = sizeof(args),
+ .iova = iova,
+ .length = length,
+ .ioas_id = ioas_id,
+ };
+
+ if (ioctl(fd, IOMMU_IOAS_UNMAP, &args))
+ return -errno;
+
+ if (unmapped)
+ *unmapped = args.length;
+
+ return 0;
+}
+
+static int iommufd_unmap(struct iommu *iommu, struct dma_region *region,
+ u64 *unmapped)
+{
+ return __iommufd_unmap(iommu->iommufd, region->iova, region->size,
+ iommu->ioas_id, unmapped);
+}
+
+int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped)
+{
+ int ret;
+
+ if (iommu->iommufd)
+ ret = iommufd_unmap(iommu, region, unmapped);
+ else
+ ret = vfio_iommu_unmap(iommu, region, unmapped);
+
+ if (ret)
+ return ret;
+
+ list_del_init(&region->link);
+
+ return 0;
+}
+
+int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped)
+{
+ int ret;
+ struct dma_region *curr, *next;
+
+ if (iommu->iommufd)
+ ret = __iommufd_unmap(iommu->iommufd, 0, UINT64_MAX,
+ iommu->ioas_id, unmapped);
+ else
+ ret = __vfio_iommu_unmap(iommu->container_fd, 0, 0,
+ VFIO_DMA_UNMAP_FLAG_ALL, unmapped);
+
+ if (ret)
+ return ret;
+
+ list_for_each_entry_safe(curr, next, &iommu->dma_regions, link)
+ list_del_init(&curr->link);
+
+ return 0;
+}
+
+static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz,
+ u32 *cap_offset)
+{
+ struct vfio_info_cap_header *hdr;
+
+ if (!*cap_offset)
+ return NULL;
+
+ VFIO_ASSERT_LT(*cap_offset, bufsz);
+ VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr));
+
+ hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset);
+ *cap_offset = hdr->next;
+
+ return hdr;
+}
+
+static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info,
+ u16 cap_id)
+{
+ struct vfio_info_cap_header *hdr;
+ u32 cap_offset = info->cap_offset;
+ u32 max_depth;
+ u32 depth = 0;
+
+ if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
+ return NULL;
+
+ if (cap_offset)
+ VFIO_ASSERT_GE(cap_offset, sizeof(*info));
+
+ max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr);
+
+ while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) {
+ depth++;
+ VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n");
+
+ if (hdr->id == cap_id)
+ return hdr;
+ }
+
+ return NULL;
+}
+
+/* Return buffer including capability chain, if present. Free with free() */
+static struct vfio_iommu_type1_info *vfio_iommu_get_info(int container_fd)
+{
+ struct vfio_iommu_type1_info *info;
+
+ info = malloc(sizeof(*info));
+ VFIO_ASSERT_NOT_NULL(info);
+
+ *info = (struct vfio_iommu_type1_info) {
+ .argsz = sizeof(*info),
+ };
+
+ ioctl_assert(container_fd, VFIO_IOMMU_GET_INFO, info);
+ VFIO_ASSERT_GE(info->argsz, sizeof(*info));
+
+ info = realloc(info, info->argsz);
+ VFIO_ASSERT_NOT_NULL(info);
+
+ ioctl_assert(container_fd, VFIO_IOMMU_GET_INFO, info);
+ VFIO_ASSERT_GE(info->argsz, sizeof(*info));
+
+ return info;
+}
+
+/*
+ * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to
+ * report iommufd's iommu_iova_range. Free with free().
+ */
+static struct iommu_iova_range *vfio_iommu_iova_ranges(struct iommu *iommu,
+ u32 *nranges)
+{
+ struct vfio_iommu_type1_info_cap_iova_range *cap_range;
+ struct vfio_iommu_type1_info *info;
+ struct vfio_info_cap_header *hdr;
+ struct iommu_iova_range *ranges = NULL;
+
+ info = vfio_iommu_get_info(iommu->container_fd);
+ hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
+ VFIO_ASSERT_NOT_NULL(hdr);
+
+ cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header);
+ VFIO_ASSERT_GT(cap_range->nr_iovas, 0);
+
+ ranges = calloc(cap_range->nr_iovas, sizeof(*ranges));
+ VFIO_ASSERT_NOT_NULL(ranges);
+
+ for (u32 i = 0; i < cap_range->nr_iovas; i++) {
+ ranges[i] = (struct iommu_iova_range){
+ .start = cap_range->iova_ranges[i].start,
+ .last = cap_range->iova_ranges[i].end,
+ };
+ }
+
+ *nranges = cap_range->nr_iovas;
+
+ free(info);
+ return ranges;
+}
+
+/* Return iova ranges of the device's IOAS. Free with free() */
+static struct iommu_iova_range *iommufd_iova_ranges(struct iommu *iommu,
+ u32 *nranges)
+{
+ struct iommu_iova_range *ranges;
+ int ret;
+
+ struct iommu_ioas_iova_ranges query = {
+ .size = sizeof(query),
+ .ioas_id = iommu->ioas_id,
+ };
+
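+	/*
+	 * The first call is a size probe: it is expected to fail with
+	 * EMSGSIZE while filling in query.num_iovas.
+	 */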
+ ret = ioctl(iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query);
+ VFIO_ASSERT_EQ(ret, -1);
+ VFIO_ASSERT_EQ(errno, EMSGSIZE);
+ VFIO_ASSERT_GT(query.num_iovas, 0);
+
+ ranges = calloc(query.num_iovas, sizeof(*ranges));
+ VFIO_ASSERT_NOT_NULL(ranges);
+
+ query.allowed_iovas = (uintptr_t)ranges;
+
+ ioctl_assert(iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query);
+ *nranges = query.num_iovas;
+
+ return ranges;
+}
+
+static int iova_range_comp(const void *a, const void *b)
+{
+ const struct iommu_iova_range *ra = a, *rb = b;
+
+ if (ra->start < rb->start)
+ return -1;
+
+ if (ra->start > rb->start)
+ return 1;
+
+ return 0;
+}
+
+/* Return sorted IOVA ranges of the device. Free with free(). */
+struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges)
+{
+ struct iommu_iova_range *ranges;
+
+ if (iommu->iommufd)
+ ranges = iommufd_iova_ranges(iommu, nranges);
+ else
+ ranges = vfio_iommu_iova_ranges(iommu, nranges);
+
+ if (!ranges)
+ return NULL;
+
+ VFIO_ASSERT_GT(*nranges, 0);
+
+ /* Sort and check that ranges are sane and non-overlapping */
+ qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp);
+ VFIO_ASSERT_LT(ranges[0].start, ranges[0].last);
+
+ for (u32 i = 1; i < *nranges; i++) {
+ VFIO_ASSERT_LT(ranges[i].start, ranges[i].last);
+ VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start);
+ }
+
+ return ranges;
+}
+
+static u32 iommufd_ioas_alloc(int iommufd)
+{
+ struct iommu_ioas_alloc args = {
+ .size = sizeof(args),
+ };
+
+ ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args);
+ return args.out_ioas_id;
+}
+
+struct iommu *iommu_init(const char *iommu_mode)
+{
+ const char *container_path;
+ struct iommu *iommu;
+ int version;
+
+ iommu = calloc(1, sizeof(*iommu));
+ VFIO_ASSERT_NOT_NULL(iommu);
+
+ INIT_LIST_HEAD(&iommu->dma_regions);
+
+ iommu->mode = lookup_iommu_mode(iommu_mode);
+
+ container_path = iommu->mode->container_path;
+ if (container_path) {
+ iommu->container_fd = open(container_path, O_RDWR);
+ VFIO_ASSERT_GE(iommu->container_fd, 0, "open(%s) failed\n", container_path);
+
+ version = ioctl(iommu->container_fd, VFIO_GET_API_VERSION);
+ VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version);
+ } else {
+ /*
+		 * Require iommu->iommufd to be >0 so that a simple non-0 check can be
+ * used to check if iommufd is enabled. In practice open() will never
+ * return 0 unless stdin is closed.
+ */
+ iommu->iommufd = open("/dev/iommu", O_RDWR);
+ VFIO_ASSERT_GT(iommu->iommufd, 0);
+
+ iommu->ioas_id = iommufd_ioas_alloc(iommu->iommufd);
+ }
+
+ return iommu;
+}
+
+void iommu_cleanup(struct iommu *iommu)
+{
+ if (iommu->iommufd)
+ VFIO_ASSERT_EQ(close(iommu->iommufd), 0);
+ else
+ VFIO_ASSERT_EQ(close(iommu->container_fd), 0);
+
+ free(iommu);
+}
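For reference, a sketch of selecting a mode explicitly; the name must match an entry in iommu_modes[], and NULL falls back to default_iommu_mode.

    struct iommu *iommu;

    iommu = iommu_init(NULL);                 /* "iommufd" by default */
    iommu_cleanup(iommu);

    iommu = iommu_init("vfio_type1v2_iommu"); /* legacy container path */
    iommu_cleanup(iommu);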
diff --git a/tools/testing/selftests/vfio/lib/iova_allocator.c b/tools/testing/selftests/vfio/lib/iova_allocator.c
new file mode 100644
index 000000000000..a12b0a51e9e6
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/iova_allocator.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <dirent.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <uapi/linux/types.h>
+#include <linux/iommufd.h>
+#include <linux/limits.h>
+#include <linux/mman.h>
+#include <linux/overflow.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+
+#include <libvfio.h>
+
+struct iova_allocator *iova_allocator_init(struct iommu *iommu)
+{
+ struct iova_allocator *allocator;
+ struct iommu_iova_range *ranges;
+ u32 nranges;
+
+ ranges = iommu_iova_ranges(iommu, &nranges);
+ VFIO_ASSERT_NOT_NULL(ranges);
+
+ allocator = malloc(sizeof(*allocator));
+ VFIO_ASSERT_NOT_NULL(allocator);
+
+ *allocator = (struct iova_allocator){
+ .ranges = ranges,
+ .nranges = nranges,
+ .range_idx = 0,
+ .range_offset = 0,
+ };
+
+ return allocator;
+}
+
+void iova_allocator_cleanup(struct iova_allocator *allocator)
+{
+ free(allocator->ranges);
+ free(allocator);
+}
+
+iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size)
+{
+ VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n");
+ VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n");
+
+ for (;;) {
+ struct iommu_iova_range *range;
+ iova_t iova, last;
+
+ VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges,
+ "IOVA allocator out of space\n");
+
+ range = &allocator->ranges[allocator->range_idx];
+ iova = range->start + allocator->range_offset;
+
+ /* Check for sufficient space at the current offset */
+ if (check_add_overflow(iova, size - 1, &last) ||
+ last > range->last)
+ goto next_range;
+
+ /* Align iova to size */
+ iova = last & ~(size - 1);
+
+ /* Check for sufficient space at the aligned iova */
+ if (check_add_overflow(iova, size - 1, &last) ||
+ last > range->last)
+ goto next_range;
+
+ if (last == range->last) {
+ allocator->range_idx++;
+ allocator->range_offset = 0;
+ } else {
+ allocator->range_offset = last - range->start + 1;
+ }
+
+ return iova;
+
+next_range:
+ allocator->range_idx++;
+ allocator->range_offset = 0;
+ }
+}
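The align-up step in iova_allocator_alloc() is worth a worked example: last already equals iova + size - 1, so masking it down to a size boundary rounds iova up. With an assumed range offset of 0x1003 and a 0x1000 allocation:

    iova_t iova = 0x1003;                 /* range->start + range_offset */
    iova_t last = iova + 0x1000 - 1;      /* 0x2002 */

    iova = last & ~(iova_t)(0x1000 - 1);  /* 0x2000: iova aligned up */
    last = iova + 0x1000 - 1;             /* 0x2fff: bounds re-checked */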
diff --git a/tools/testing/selftests/vfio/lib/libvfio.c b/tools/testing/selftests/vfio/lib/libvfio.c
new file mode 100644
index 000000000000..a23a3cc5be69
--- /dev/null
+++ b/tools/testing/selftests/vfio/lib/libvfio.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../../../kselftest.h"
+#include <libvfio.h>
+
+static bool is_bdf(const char *str)
+{
+ unsigned int s, b, d, f;
+ int length, count;
+
+ count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, &f, &length);
+ return count == 4 && length == strlen(str);
+}
+
+static char **get_bdfs_cmdline(int *argc, char *argv[], int *nr_bdfs)
+{
+ int i;
+
+ for (i = *argc - 1; i > 0 && is_bdf(argv[i]); i--)
+ continue;
+
+ i++;
+ *nr_bdfs = *argc - i;
+ *argc -= *nr_bdfs;
+
+ return *nr_bdfs ? &argv[i] : NULL;
+}
+
+static char *get_bdf_env(void)
+{
+ char *bdf;
+
+ bdf = getenv("VFIO_SELFTESTS_BDF");
+ if (!bdf)
+ return NULL;
+
+ VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf);
+ return bdf;
+}
+
+char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs)
+{
+ static char *env_bdf;
+ char **bdfs;
+
+ bdfs = get_bdfs_cmdline(argc, argv, nr_bdfs);
+ if (bdfs)
+ return bdfs;
+
+ env_bdf = get_bdf_env();
+ if (env_bdf) {
+ *nr_bdfs = 1;
+ return &env_bdf;
+ }
+
+ fprintf(stderr, "Unable to determine which device(s) to use, skipping test.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "To pass the device address via environment variable:\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " export VFIO_SELFTESTS_BDF=\"segment:bus:device.function\"\n");
+ fprintf(stderr, " %s [options]\n", argv[0]);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "To pass the device address(es) via argv:\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " %s [options] segment:bus:device.function ...\n", argv[0]);
+ fprintf(stderr, "\n");
+ exit(KSFT_SKIP);
+}
+
+const char *vfio_selftests_get_bdf(int *argc, char *argv[])
+{
+ int nr_bdfs;
+
+ return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0];
+}
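To illustrate the parsing contract with a hypothetical invocation: for "./vfio_test -v 0000:00:04.0 0000:00:05.0", argc arrives as 4; after the call nr_bdfs is 2, argc drops to 2, and the returned array aliases the trailing argv entries.

    int nr_bdfs;
    char **bdfs = vfio_selftests_get_bdfs(&argc, argv, &nr_bdfs);

    for (int i = 0; i < nr_bdfs; i++)
        printf("device %d: %s\n", i, bdfs[i]);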
diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk
index 5d11c3a89a28..9f47bceed16f 100644
--- a/tools/testing/selftests/vfio/lib/libvfio.mk
+++ b/tools/testing/selftests/vfio/lib/libvfio.mk
@@ -1,24 +1,29 @@
include $(top_srcdir)/scripts/subarch.include
ARCH ?= $(SUBARCH)
-VFIO_DIR := $(selfdir)/vfio
+LIBVFIO_SRCDIR := $(selfdir)/vfio/lib
-LIBVFIO_C := lib/vfio_pci_device.c
-LIBVFIO_C += lib/vfio_pci_driver.c
+LIBVFIO_C := iommu.c
+LIBVFIO_C += iova_allocator.c
+LIBVFIO_C += libvfio.c
+LIBVFIO_C += vfio_pci_device.c
+LIBVFIO_C += vfio_pci_driver.c
ifeq ($(ARCH:x86_64=x86),x86)
-LIBVFIO_C += lib/drivers/ioat/ioat.c
-LIBVFIO_C += lib/drivers/dsa/dsa.c
+LIBVFIO_C += drivers/ioat/ioat.c
+LIBVFIO_C += drivers/dsa/dsa.c
endif
-LIBVFIO_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBVFIO_C))
+LIBVFIO_OUTPUT := $(OUTPUT)/libvfio
+
+LIBVFIO_O := $(patsubst %.c, $(LIBVFIO_OUTPUT)/%.o, $(LIBVFIO_C))
LIBVFIO_O_DIRS := $(shell dirname $(LIBVFIO_O) | uniq)
$(shell mkdir -p $(LIBVFIO_O_DIRS))
-CFLAGS += -I$(VFIO_DIR)/lib/include
+CFLAGS += -I$(LIBVFIO_SRCDIR)/include
-$(LIBVFIO_O): $(OUTPUT)/%.o : $(VFIO_DIR)/%.c
+$(LIBVFIO_O): $(LIBVFIO_OUTPUT)/%.o : $(LIBVFIO_SRCDIR)/%.c
$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
-EXTRA_CLEAN += $(LIBVFIO_O)
+EXTRA_CLEAN += $(LIBVFIO_OUTPUT)
diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
index b479a359da12..13fdb4b0b10f 100644
--- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c
+++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
@@ -20,286 +20,10 @@
#include <linux/vfio.h>
#include "../../../kselftest.h"
-#include <vfio_util.h>
+#include <libvfio.h>
#define PCI_SYSFS_PATH "/sys/bus/pci/devices"
-#define ioctl_assert(_fd, _op, _arg) do { \
- void *__arg = (_arg); \
- int __ret = ioctl((_fd), (_op), (__arg)); \
- VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \
-} while (0)
-
-static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz,
- u32 *cap_offset)
-{
- struct vfio_info_cap_header *hdr;
-
- if (!*cap_offset)
- return NULL;
-
- VFIO_ASSERT_LT(*cap_offset, bufsz);
- VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr));
-
- hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset);
- *cap_offset = hdr->next;
-
- return hdr;
-}
-
-static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info,
- u16 cap_id)
-{
- struct vfio_info_cap_header *hdr;
- u32 cap_offset = info->cap_offset;
- u32 max_depth;
- u32 depth = 0;
-
- if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
- return NULL;
-
- if (cap_offset)
- VFIO_ASSERT_GE(cap_offset, sizeof(*info));
-
- max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr);
-
- while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) {
- depth++;
- VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n");
-
- if (hdr->id == cap_id)
- return hdr;
- }
-
- return NULL;
-}
-
-/* Return buffer including capability chain, if present. Free with free() */
-static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device)
-{
- struct vfio_iommu_type1_info *info;
-
- info = malloc(sizeof(*info));
- VFIO_ASSERT_NOT_NULL(info);
-
- *info = (struct vfio_iommu_type1_info) {
- .argsz = sizeof(*info),
- };
-
- ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info);
- VFIO_ASSERT_GE(info->argsz, sizeof(*info));
-
- info = realloc(info, info->argsz);
- VFIO_ASSERT_NOT_NULL(info);
-
- ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info);
- VFIO_ASSERT_GE(info->argsz, sizeof(*info));
-
- return info;
-}
-
-/*
- * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to
- * report iommufd's iommu_iova_range. Free with free().
- */
-static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device,
- u32 *nranges)
-{
- struct vfio_iommu_type1_info_cap_iova_range *cap_range;
- struct vfio_iommu_type1_info *info;
- struct vfio_info_cap_header *hdr;
- struct iommu_iova_range *ranges = NULL;
-
- info = vfio_iommu_get_info(device);
- hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
- VFIO_ASSERT_NOT_NULL(hdr);
-
- cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header);
- VFIO_ASSERT_GT(cap_range->nr_iovas, 0);
-
- ranges = calloc(cap_range->nr_iovas, sizeof(*ranges));
- VFIO_ASSERT_NOT_NULL(ranges);
-
- for (u32 i = 0; i < cap_range->nr_iovas; i++) {
- ranges[i] = (struct iommu_iova_range){
- .start = cap_range->iova_ranges[i].start,
- .last = cap_range->iova_ranges[i].end,
- };
- }
-
- *nranges = cap_range->nr_iovas;
-
- free(info);
- return ranges;
-}
-
-/* Return iova ranges of the device's IOAS. Free with free() */
-static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device,
- u32 *nranges)
-{
- struct iommu_iova_range *ranges;
- int ret;
-
- struct iommu_ioas_iova_ranges query = {
- .size = sizeof(query),
- .ioas_id = device->ioas_id,
- };
-
- ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query);
- VFIO_ASSERT_EQ(ret, -1);
- VFIO_ASSERT_EQ(errno, EMSGSIZE);
- VFIO_ASSERT_GT(query.num_iovas, 0);
-
- ranges = calloc(query.num_iovas, sizeof(*ranges));
- VFIO_ASSERT_NOT_NULL(ranges);
-
- query.allowed_iovas = (uintptr_t)ranges;
-
- ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query);
- *nranges = query.num_iovas;
-
- return ranges;
-}
-
-static int iova_range_comp(const void *a, const void *b)
-{
- const struct iommu_iova_range *ra = a, *rb = b;
-
- if (ra->start < rb->start)
- return -1;
-
- if (ra->start > rb->start)
- return 1;
-
- return 0;
-}
-
-/* Return sorted IOVA ranges of the device. Free with free(). */
-struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device,
- u32 *nranges)
-{
- struct iommu_iova_range *ranges;
-
- if (device->iommufd)
- ranges = iommufd_iova_ranges(device, nranges);
- else
- ranges = vfio_iommu_iova_ranges(device, nranges);
-
- if (!ranges)
- return NULL;
-
- VFIO_ASSERT_GT(*nranges, 0);
-
- /* Sort and check that ranges are sane and non-overlapping */
- qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp);
- VFIO_ASSERT_LT(ranges[0].start, ranges[0].last);
-
- for (u32 i = 1; i < *nranges; i++) {
- VFIO_ASSERT_LT(ranges[i].start, ranges[i].last);
- VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start);
- }
-
- return ranges;
-}
-
-struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device)
-{
- struct iova_allocator *allocator;
- struct iommu_iova_range *ranges;
- u32 nranges;
-
- ranges = vfio_pci_iova_ranges(device, &nranges);
- VFIO_ASSERT_NOT_NULL(ranges);
-
- allocator = malloc(sizeof(*allocator));
- VFIO_ASSERT_NOT_NULL(allocator);
-
- *allocator = (struct iova_allocator){
- .ranges = ranges,
- .nranges = nranges,
- .range_idx = 0,
- .range_offset = 0,
- };
-
- return allocator;
-}
-
-void iova_allocator_cleanup(struct iova_allocator *allocator)
-{
- free(allocator->ranges);
- free(allocator);
-}
-
-iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size)
-{
- VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n");
- VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n");
-
- for (;;) {
- struct iommu_iova_range *range;
- iova_t iova, last;
-
- VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges,
- "IOVA allocator out of space\n");
-
- range = &allocator->ranges[allocator->range_idx];
- iova = range->start + allocator->range_offset;
-
- /* Check for sufficient space at the current offset */
- if (check_add_overflow(iova, size - 1, &last) ||
- last > range->last)
- goto next_range;
-
- /* Align iova to size */
- iova = last & ~(size - 1);
-
- /* Check for sufficient space at the aligned iova */
- if (check_add_overflow(iova, size - 1, &last) ||
- last > range->last)
- goto next_range;
-
- if (last == range->last) {
- allocator->range_idx++;
- allocator->range_offset = 0;
- } else {
- allocator->range_offset = last - range->start + 1;
- }
-
- return iova;
-
-next_range:
- allocator->range_idx++;
- allocator->range_offset = 0;
- }
-}
-
-iova_t __to_iova(struct vfio_pci_device *device, void *vaddr)
-{
- struct vfio_dma_region *region;
-
- list_for_each_entry(region, &device->dma_regions, link) {
- if (vaddr < region->vaddr)
- continue;
-
- if (vaddr >= region->vaddr + region->size)
- continue;
-
- return region->iova + (vaddr - region->vaddr);
- }
-
- return INVALID_IOVA;
-}
-
-iova_t to_iova(struct vfio_pci_device *device, void *vaddr)
-{
- iova_t iova;
-
- iova = __to_iova(device, vaddr);
- VFIO_ASSERT_NE(iova, INVALID_IOVA, "%p is not mapped into device.\n", vaddr);
-
- return iova;
-}
-
static void vfio_pci_irq_set(struct vfio_pci_device *device,
u32 index, u32 vector, u32 count, int *fds)
{
@@ -386,141 +110,6 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index,
ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info);
}
-static int vfio_iommu_dma_map(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
-{
- struct vfio_iommu_type1_dma_map args = {
- .argsz = sizeof(args),
- .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
- .vaddr = (u64)region->vaddr,
- .iova = region->iova,
- .size = region->size,
- };
-
- if (ioctl(device->container_fd, VFIO_IOMMU_MAP_DMA, &args))
- return -errno;
-
- return 0;
-}
-
-static int iommufd_dma_map(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
-{
- struct iommu_ioas_map args = {
- .size = sizeof(args),
- .flags = IOMMU_IOAS_MAP_READABLE |
- IOMMU_IOAS_MAP_WRITEABLE |
- IOMMU_IOAS_MAP_FIXED_IOVA,
- .user_va = (u64)region->vaddr,
- .iova = region->iova,
- .length = region->size,
- .ioas_id = device->ioas_id,
- };
-
- if (ioctl(device->iommufd, IOMMU_IOAS_MAP, &args))
- return -errno;
-
- return 0;
-}
-
-int __vfio_pci_dma_map(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
-{
- int ret;
-
- if (device->iommufd)
- ret = iommufd_dma_map(device, region);
- else
- ret = vfio_iommu_dma_map(device, region);
-
- if (ret)
- return ret;
-
- list_add(&region->link, &device->dma_regions);
-
- return 0;
-}
-
-static int vfio_iommu_dma_unmap(int fd, u64 iova, u64 size, u32 flags,
- u64 *unmapped)
-{
- struct vfio_iommu_type1_dma_unmap args = {
- .argsz = sizeof(args),
- .iova = iova,
- .size = size,
- .flags = flags,
- };
-
- if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args))
- return -errno;
-
- if (unmapped)
- *unmapped = args.size;
-
- return 0;
-}
-
-static int iommufd_dma_unmap(int fd, u64 iova, u64 length, u32 ioas_id,
- u64 *unmapped)
-{
- struct iommu_ioas_unmap args = {
- .size = sizeof(args),
- .iova = iova,
- .length = length,
- .ioas_id = ioas_id,
- };
-
- if (ioctl(fd, IOMMU_IOAS_UNMAP, &args))
- return -errno;
-
- if (unmapped)
- *unmapped = args.length;
-
- return 0;
-}
-
-int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
- struct vfio_dma_region *region, u64 *unmapped)
-{
- int ret;
-
- if (device->iommufd)
- ret = iommufd_dma_unmap(device->iommufd, region->iova,
- region->size, device->ioas_id,
- unmapped);
- else
- ret = vfio_iommu_dma_unmap(device->container_fd, region->iova,
- region->size, 0, unmapped);
-
- if (ret)
- return ret;
-
- list_del_init(&region->link);
-
- return 0;
-}
-
-int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped)
-{
- int ret;
- struct vfio_dma_region *curr, *next;
-
- if (device->iommufd)
- ret = iommufd_dma_unmap(device->iommufd, 0, UINT64_MAX,
- device->ioas_id, unmapped);
- else
- ret = vfio_iommu_dma_unmap(device->container_fd, 0, 0,
- VFIO_DMA_UNMAP_FLAG_ALL, unmapped);
-
- if (ret)
- return ret;
-
- list_for_each_entry_safe(curr, next, &device->dma_regions, link)
- list_del_init(&curr->link);
-
- return 0;
-}
-
static void vfio_pci_region_get(struct vfio_pci_device *device, int index,
struct vfio_region_info *info)
{
@@ -627,28 +216,26 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf
ioctl_assert(device->group_fd, VFIO_GROUP_GET_STATUS, &group_status);
VFIO_ASSERT_TRUE(group_status.flags & VFIO_GROUP_FLAGS_VIABLE);
- ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->container_fd);
+ ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->iommu->container_fd);
}
static void vfio_pci_container_setup(struct vfio_pci_device *device, const char *bdf)
{
- unsigned long iommu_type = device->iommu_mode->iommu_type;
- const char *path = device->iommu_mode->container_path;
- int version;
+ struct iommu *iommu = device->iommu;
+ unsigned long iommu_type = iommu->mode->iommu_type;
int ret;
- device->container_fd = open(path, O_RDWR);
- VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", path);
-
- version = ioctl(device->container_fd, VFIO_GET_API_VERSION);
- VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version);
-
vfio_pci_group_setup(device, bdf);
- ret = ioctl(device->container_fd, VFIO_CHECK_EXTENSION, iommu_type);
+ ret = ioctl(iommu->container_fd, VFIO_CHECK_EXTENSION, iommu_type);
VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type);
- ioctl_assert(device->container_fd, VFIO_SET_IOMMU, (void *)iommu_type);
+ /*
+ * Allow multiple threads to race to set the IOMMU type on the
+ * container. The first will succeed and the rest should fail
+ * because the IOMMU type is already set.
+ */
+ (void)ioctl(iommu->container_fd, VFIO_SET_IOMMU, (void *)iommu_type);
device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf);
VFIO_ASSERT_GE(device->fd, 0);
@@ -712,52 +299,6 @@ const char *vfio_pci_get_cdev_path(const char *bdf)
return cdev_path;
}
-/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */
-static const struct vfio_iommu_mode iommu_modes[] = {
- {
- .name = "vfio_type1_iommu",
- .container_path = "/dev/vfio/vfio",
- .iommu_type = VFIO_TYPE1_IOMMU,
- },
- {
- .name = "vfio_type1v2_iommu",
- .container_path = "/dev/vfio/vfio",
- .iommu_type = VFIO_TYPE1v2_IOMMU,
- },
- {
- .name = "iommufd_compat_type1",
- .container_path = "/dev/iommu",
- .iommu_type = VFIO_TYPE1_IOMMU,
- },
- {
- .name = "iommufd_compat_type1v2",
- .container_path = "/dev/iommu",
- .iommu_type = VFIO_TYPE1v2_IOMMU,
- },
- {
- .name = "iommufd",
- },
-};
-
-const char *default_iommu_mode = "iommufd";
-
-static const struct vfio_iommu_mode *lookup_iommu_mode(const char *iommu_mode)
-{
- int i;
-
- if (!iommu_mode)
- iommu_mode = default_iommu_mode;
-
- for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) {
- if (strcmp(iommu_mode, iommu_modes[i].name))
- continue;
-
- return &iommu_modes[i];
- }
-
- VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode);
-}
-
static void vfio_device_bind_iommufd(int device_fd, int iommufd)
{
struct vfio_device_bind_iommufd args = {
@@ -768,16 +309,6 @@ static void vfio_device_bind_iommufd(int device_fd, int iommufd)
ioctl_assert(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args);
}
-static u32 iommufd_ioas_alloc(int iommufd)
-{
- struct iommu_ioas_alloc args = {
- .size = sizeof(args),
- };
-
- ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args);
- return args.out_ioas_id;
-}
-
static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id)
{
struct vfio_device_attach_iommufd_pt args = {
@@ -796,31 +327,22 @@ static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *b
VFIO_ASSERT_GE(device->fd, 0);
free((void *)cdev_path);
- /*
- * Require device->iommufd to be >0 so that a simple non-0 check can be
- * used to check if iommufd is enabled. In practice open() will never
- * return 0 unless stdin is closed.
- */
- device->iommufd = open("/dev/iommu", O_RDWR);
- VFIO_ASSERT_GT(device->iommufd, 0);
-
- vfio_device_bind_iommufd(device->fd, device->iommufd);
- device->ioas_id = iommufd_ioas_alloc(device->iommufd);
- vfio_device_attach_iommufd_pt(device->fd, device->ioas_id);
+ vfio_device_bind_iommufd(device->fd, device->iommu->iommufd);
+ vfio_device_attach_iommufd_pt(device->fd, device->iommu->ioas_id);
}
-struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode)
+struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu)
{
struct vfio_pci_device *device;
device = calloc(1, sizeof(*device));
VFIO_ASSERT_NOT_NULL(device);
- INIT_LIST_HEAD(&device->dma_regions);
-
- device->iommu_mode = lookup_iommu_mode(iommu_mode);
+ VFIO_ASSERT_NOT_NULL(iommu);
+ device->iommu = iommu;
+ device->bdf = bdf;
- if (device->iommu_mode->container_path)
+ if (iommu->mode->container_path)
vfio_pci_container_setup(device, bdf);
else
vfio_pci_iommufd_setup(device, bdf);
@@ -849,48 +371,8 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device)
VFIO_ASSERT_EQ(close(device->msi_eventfds[i]), 0);
}
- if (device->iommufd) {
- VFIO_ASSERT_EQ(close(device->iommufd), 0);
- } else {
+ if (device->group_fd)
VFIO_ASSERT_EQ(close(device->group_fd), 0);
- VFIO_ASSERT_EQ(close(device->container_fd), 0);
- }
free(device);
}
-
-static bool is_bdf(const char *str)
-{
- unsigned int s, b, d, f;
- int length, count;
-
- count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, &f, &length);
- return count == 4 && length == strlen(str);
-}
-
-const char *vfio_selftests_get_bdf(int *argc, char *argv[])
-{
- char *bdf;
-
- if (*argc > 1 && is_bdf(argv[*argc - 1]))
- return argv[--(*argc)];
-
- bdf = getenv("VFIO_SELFTESTS_BDF");
- if (bdf) {
- VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf);
- return bdf;
- }
-
- fprintf(stderr, "Unable to determine which device to use, skipping test.\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "To pass the device address via environment variable:\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " export VFIO_SELFTESTS_BDF=segment:bus:device.function\n");
- fprintf(stderr, " %s [options]\n", argv[0]);
- fprintf(stderr, "\n");
- fprintf(stderr, "To pass the device address via argv:\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " %s [options] segment:bus:device.function\n", argv[0]);
- fprintf(stderr, "\n");
- exit(KSFT_SKIP);
-}
diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c
index e5e8723ecb41..ca0e25efbfa1 100644
--- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c
+++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <stdio.h>
-
#include "../../../kselftest.h"
-#include <vfio_util.h>
+#include <libvfio.h>
#ifdef __x86_64__
extern struct vfio_pci_driver_ops dsa_ops;
@@ -29,7 +27,6 @@ void vfio_pci_driver_probe(struct vfio_pci_device *device)
if (ops->probe(device))
continue;
- printf("Driver found: %s\n", ops->name);
device->driver.ops = ops;
}
}
@@ -58,17 +55,6 @@ void vfio_pci_driver_init(struct vfio_pci_device *device)
driver->ops->init(device);
driver->initialized = true;
-
- printf("%s: region: vaddr %p, iova 0x%lx, size 0x%lx\n",
- driver->ops->name,
- driver->region.vaddr,
- driver->region.iova,
- driver->region.size);
-
- printf("%s: max_memcpy_size 0x%lx, max_memcpy_count 0x%lx\n",
- driver->ops->name,
- driver->max_memcpy_size,
- driver->max_memcpy_count);
}
void vfio_pci_driver_remove(struct vfio_pci_device *device)
diff --git a/tools/testing/selftests/vfio/run.sh b/tools/testing/selftests/vfio/run.sh
deleted file mode 100755
index 0476b6d7adc3..000000000000
--- a/tools/testing/selftests/vfio/run.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-
-# Global variables initialized in main() and then used during cleanup() when
-# the script exits.
-declare DEVICE_BDF
-declare NEW_DRIVER
-declare OLD_DRIVER
-declare OLD_NUMVFS
-declare DRIVER_OVERRIDE
-
-function write_to() {
- # Unfortunately set -x does not show redirects so use echo to manually
- # tell the user what commands are being run.
- echo "+ echo \"${2}\" > ${1}"
- echo "${2}" > ${1}
-}
-
-function bind() {
- write_to /sys/bus/pci/drivers/${2}/bind ${1}
-}
-
-function unbind() {
- write_to /sys/bus/pci/drivers/${2}/unbind ${1}
-}
-
-function set_sriov_numvfs() {
- write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2}
-}
-
-function set_driver_override() {
- write_to /sys/bus/pci/devices/${1}/driver_override ${2}
-}
-
-function clear_driver_override() {
- set_driver_override ${1} ""
-}
-
-function cleanup() {
- if [ "${NEW_DRIVER}" ]; then unbind ${DEVICE_BDF} ${NEW_DRIVER} ; fi
- if [ "${DRIVER_OVERRIDE}" ]; then clear_driver_override ${DEVICE_BDF} ; fi
- if [ "${OLD_DRIVER}" ]; then bind ${DEVICE_BDF} ${OLD_DRIVER} ; fi
- if [ "${OLD_NUMVFS}" ]; then set_sriov_numvfs ${DEVICE_BDF} ${OLD_NUMVFS} ; fi
-}
-
-function usage() {
- echo "usage: $0 [-d segment:bus:device.function] [-s] [-h] [cmd ...]" >&2
- echo >&2
- echo " -d: The BDF of the device to use for the test (required)" >&2
- echo " -h: Show this help message" >&2
- echo " -s: Drop into a shell rather than running a command" >&2
- echo >&2
- echo " cmd: The command to run and arguments to pass to it." >&2
- echo " Required when not using -s. The SBDF will be " >&2
- echo " appended to the argument list." >&2
- exit 1
-}
-
-function main() {
- local shell
-
- while getopts "d:hs" opt; do
- case $opt in
- d) DEVICE_BDF="$OPTARG" ;;
- s) shell=true ;;
- *) usage ;;
- esac
- done
-
- # Shift past all optional arguments.
- shift $((OPTIND - 1))
-
- # Check that the user passed in the command to run.
- [ ! "${shell}" ] && [ $# = 0 ] && usage
-
- # Check that the user passed in a BDF.
- [ "${DEVICE_BDF}" ] || usage
-
- trap cleanup EXIT
- set -e
-
- test -d /sys/bus/pci/devices/${DEVICE_BDF}
-
- if [ -f /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs ]; then
- OLD_NUMVFS=$(cat /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs)
- set_sriov_numvfs ${DEVICE_BDF} 0
- fi
-
- if [ -L /sys/bus/pci/devices/${DEVICE_BDF}/driver ]; then
- OLD_DRIVER=$(basename $(readlink -m /sys/bus/pci/devices/${DEVICE_BDF}/driver))
- unbind ${DEVICE_BDF} ${OLD_DRIVER}
- fi
-
- set_driver_override ${DEVICE_BDF} vfio-pci
- DRIVER_OVERRIDE=true
-
- bind ${DEVICE_BDF} vfio-pci
- NEW_DRIVER=vfio-pci
-
- echo
- if [ "${shell}" ]; then
- echo "Dropping into ${SHELL} with VFIO_SELFTESTS_BDF=${DEVICE_BDF}"
- VFIO_SELFTESTS_BDF=${DEVICE_BDF} ${SHELL}
- else
- "$@" ${DEVICE_BDF}
- fi
- echo
-}
-
-main "$@"
diff --git a/tools/testing/selftests/vfio/scripts/cleanup.sh b/tools/testing/selftests/vfio/scripts/cleanup.sh
new file mode 100755
index 000000000000..69c922d8aafb
--- /dev/null
+++ b/tools/testing/selftests/vfio/scripts/cleanup.sh
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh
+
+function cleanup_devices() {
+ local device_bdf
+ local device_dir
+
+ for device_bdf in "$@"; do
+ device_dir=${DEVICES_DIR}/${device_bdf}
+
+ if [ -f ${device_dir}/vfio-pci ]; then
+ unbind ${device_bdf} vfio-pci
+ fi
+
+ if [ -f ${device_dir}/driver_override ]; then
+ clear_driver_override ${device_bdf}
+ fi
+
+ if [ -f ${device_dir}/driver ]; then
+ bind ${device_bdf} $(cat ${device_dir}/driver)
+ fi
+
+ if [ -f ${device_dir}/sriov_numvfs ]; then
+ set_sriov_numvfs ${device_bdf} $(cat ${device_dir}/sriov_numvfs)
+ fi
+
+ rm -rf ${device_dir}
+ done
+}
+
+function main() {
+ if [ $# = 0 ]; then
+ cleanup_devices $(ls ${DEVICES_DIR})
+ rmdir ${DEVICES_DIR}
+ else
+ cleanup_devices "$@"
+ fi
+}
+
+main "$@"
diff --git a/tools/testing/selftests/vfio/scripts/lib.sh b/tools/testing/selftests/vfio/scripts/lib.sh
new file mode 100755
index 000000000000..9f05f29c7b86
--- /dev/null
+++ b/tools/testing/selftests/vfio/scripts/lib.sh
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+readonly DEVICES_DIR="${TMPDIR:-/tmp}/vfio-selftests-devices"
+
+function write_to() {
+	# Unfortunately, set -x does not show redirects, so echo the command
+	# manually to show the user what is being run.
+ echo "+ echo \"${2}\" > ${1}"
+ echo "${2}" > ${1}
+}
+
+function get_driver() {
+ if [ -L /sys/bus/pci/devices/${1}/driver ]; then
+ basename $(readlink -m /sys/bus/pci/devices/${1}/driver)
+ fi
+}
+
+function bind() {
+ write_to /sys/bus/pci/drivers/${2}/bind ${1}
+}
+
+function unbind() {
+ write_to /sys/bus/pci/drivers/${2}/unbind ${1}
+}
+
+function set_sriov_numvfs() {
+ write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2}
+}
+
+function get_sriov_numvfs() {
+ if [ -f /sys/bus/pci/devices/${1}/sriov_numvfs ]; then
+ cat /sys/bus/pci/devices/${1}/sriov_numvfs
+ fi
+}
+
+function set_driver_override() {
+ write_to /sys/bus/pci/devices/${1}/driver_override ${2}
+}
+
+function clear_driver_override() {
+ set_driver_override ${1} ""
+}
diff --git a/tools/testing/selftests/vfio/scripts/run.sh b/tools/testing/selftests/vfio/scripts/run.sh
new file mode 100755
index 000000000000..91fd38f9f6f6
--- /dev/null
+++ b/tools/testing/selftests/vfio/scripts/run.sh
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh
+
+function main() {
+ local device_bdfs=$(ls ${DEVICES_DIR})
+
+ if [ -z "${device_bdfs}" ]; then
+ echo "No devices found, skipping."
+		exit 4 # KSFT_SKIP
+ fi
+
+ "$@" ${device_bdfs}
+}
+
+main "$@"
diff --git a/tools/testing/selftests/vfio/scripts/setup.sh b/tools/testing/selftests/vfio/scripts/setup.sh
new file mode 100755
index 000000000000..49a499e51cbe
--- /dev/null
+++ b/tools/testing/selftests/vfio/scripts/setup.sh
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+set -e
+
+source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh
+
+function main() {
+ local device_bdf
+ local device_dir
+ local numvfs
+ local driver
+
+ if [ $# = 0 ]; then
+ echo "usage: $0 segment:bus:device.function ..." >&2
+ exit 1
+ fi
+
+ for device_bdf in "$@"; do
+ test -d /sys/bus/pci/devices/${device_bdf}
+
+ device_dir=${DEVICES_DIR}/${device_bdf}
+ if [ -d "${device_dir}" ]; then
+ echo "${device_bdf} has already been set up, exiting."
+ exit 0
+ fi
+
+ mkdir -p ${device_dir}
+
+ numvfs=$(get_sriov_numvfs ${device_bdf})
+ if [ "${numvfs}" ]; then
+ set_sriov_numvfs ${device_bdf} 0
+ echo ${numvfs} > ${device_dir}/sriov_numvfs
+ fi
+
+ driver=$(get_driver ${device_bdf})
+ if [ "${driver}" ]; then
+ unbind ${device_bdf} ${driver}
+ echo ${driver} > ${device_dir}/driver
+ fi
+
+ set_driver_override ${device_bdf} vfio-pci
+ touch ${device_dir}/driver_override
+
+ bind ${device_bdf} vfio-pci
+ touch ${device_dir}/vfio-pci
+ done
+}
+
+main "$@"
diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
index 102603d4407d..5397822c3dd4 100644
--- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
+++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
@@ -10,7 +10,7 @@
#include <linux/sizes.h>
#include <linux/vfio.h>
-#include <vfio_util.h>
+#include <libvfio.h>
#include "../kselftest_harness.h"
@@ -94,6 +94,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova,
}
FIXTURE(vfio_dma_mapping_test) {
+ struct iommu *iommu;
struct vfio_pci_device *device;
struct iova_allocator *iova_allocator;
};
@@ -119,21 +120,23 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB |
FIXTURE_SETUP(vfio_dma_mapping_test)
{
- self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
- self->iova_allocator = iova_allocator_init(self->device);
+ self->iommu = iommu_init(variant->iommu_mode);
+ self->device = vfio_pci_device_init(device_bdf, self->iommu);
+ self->iova_allocator = iova_allocator_init(self->iommu);
}
FIXTURE_TEARDOWN(vfio_dma_mapping_test)
{
iova_allocator_cleanup(self->iova_allocator);
vfio_pci_device_cleanup(self->device);
+ iommu_cleanup(self->iommu);
}
TEST_F(vfio_dma_mapping_test, dma_map_unmap)
{
const u64 size = variant->size ?: getpagesize();
const int flags = variant->mmap_flags;
- struct vfio_dma_region region;
+ struct dma_region region;
struct iommu_mapping mapping;
u64 mapping_size = size;
u64 unmapped;
@@ -150,7 +153,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
region.iova = iova_allocator_alloc(self->iova_allocator, size);
region.size = size;
- vfio_pci_dma_map(self->device, &region);
+ iommu_map(self->iommu, &region);
printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova);
ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr));
@@ -192,19 +195,20 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
}
unmap:
- rc = __vfio_pci_dma_unmap(self->device, &region, &unmapped);
+ rc = __iommu_unmap(self->iommu, &region, &unmapped);
ASSERT_EQ(rc, 0);
ASSERT_EQ(unmapped, region.size);
printf("Unmapped IOVA 0x%lx\n", region.iova);
- ASSERT_EQ(INVALID_IOVA, __to_iova(self->device, region.vaddr));
+ ASSERT_NE(0, __to_iova(self->device, region.vaddr, NULL));
ASSERT_NE(0, iommu_mapping_get(device_bdf, region.iova, &mapping));
ASSERT_TRUE(!munmap(region.vaddr, size));
}
FIXTURE(vfio_dma_map_limit_test) {
+ struct iommu *iommu;
struct vfio_pci_device *device;
- struct vfio_dma_region region;
+ struct dma_region region;
size_t mmap_size;
};
@@ -223,7 +227,7 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES();
FIXTURE_SETUP(vfio_dma_map_limit_test)
{
- struct vfio_dma_region *region = &self->region;
+ struct dma_region *region = &self->region;
struct iommu_iova_range *ranges;
u64 region_size = getpagesize();
iova_t last_iova;
@@ -235,12 +239,13 @@ FIXTURE_SETUP(vfio_dma_map_limit_test)
*/
self->mmap_size = 2 * region_size;
- self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
+ self->iommu = iommu_init(variant->iommu_mode);
+ self->device = vfio_pci_device_init(device_bdf, self->iommu);
region->vaddr = mmap(NULL, self->mmap_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
ASSERT_NE(region->vaddr, MAP_FAILED);
- ranges = vfio_pci_iova_ranges(self->device, &nranges);
+ ranges = iommu_iova_ranges(self->iommu, &nranges);
VFIO_ASSERT_NOT_NULL(ranges);
last_iova = ranges[nranges - 1].last;
free(ranges);
@@ -253,49 +258,50 @@ FIXTURE_SETUP(vfio_dma_map_limit_test)
FIXTURE_TEARDOWN(vfio_dma_map_limit_test)
{
vfio_pci_device_cleanup(self->device);
+ iommu_cleanup(self->iommu);
ASSERT_EQ(munmap(self->region.vaddr, self->mmap_size), 0);
}
TEST_F(vfio_dma_map_limit_test, unmap_range)
{
- struct vfio_dma_region *region = &self->region;
+ struct dma_region *region = &self->region;
u64 unmapped;
int rc;
- vfio_pci_dma_map(self->device, region);
+ iommu_map(self->iommu, region);
ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr));
- rc = __vfio_pci_dma_unmap(self->device, region, &unmapped);
+ rc = __iommu_unmap(self->iommu, region, &unmapped);
ASSERT_EQ(rc, 0);
ASSERT_EQ(unmapped, region->size);
}
TEST_F(vfio_dma_map_limit_test, unmap_all)
{
- struct vfio_dma_region *region = &self->region;
+ struct dma_region *region = &self->region;
u64 unmapped;
int rc;
- vfio_pci_dma_map(self->device, region);
+ iommu_map(self->iommu, region);
ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr));
- rc = __vfio_pci_dma_unmap_all(self->device, &unmapped);
+ rc = __iommu_unmap_all(self->iommu, &unmapped);
ASSERT_EQ(rc, 0);
ASSERT_EQ(unmapped, region->size);
}
TEST_F(vfio_dma_map_limit_test, overflow)
{
- struct vfio_dma_region *region = &self->region;
+ struct dma_region *region = &self->region;
int rc;
region->iova = ~(iova_t)0 & ~(region->size - 1);
region->size = self->mmap_size;
- rc = __vfio_pci_dma_map(self->device, region);
+ rc = __iommu_map(self->iommu, region);
ASSERT_EQ(rc, -EOVERFLOW);
- rc = __vfio_pci_dma_unmap(self->device, region, NULL);
+ rc = __iommu_unmap(self->iommu, region, NULL);
ASSERT_EQ(rc, -EOVERFLOW);
}
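
To summarize the refactor driving these hunks: DMA state that used to hang off struct vfio_pci_device (container fd, iommufd, ioas, dma_regions) now lives in a shared struct iommu, so multiple devices and the IOVA allocator can reference one IOMMU context. A condensed sketch of the new ownership model, using only functions visible in the hunks above (assume bdf names a valid vfio-pci device):

  #include <sys/mman.h>
  #include <unistd.h>
  #include <libvfio.h>

  static void map_one_page(const char *bdf)
  {
          struct iommu *iommu = iommu_init(default_iommu_mode);
          struct vfio_pci_device *device = vfio_pci_device_init(bdf, iommu);
          struct iova_allocator *allocator = iova_allocator_init(iommu);
          struct dma_region region = {
                  .vaddr = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                                MAP_ANONYMOUS | MAP_PRIVATE, -1, 0),
                  .size = (u64)getpagesize(),
          };

          region.iova = iova_allocator_alloc(allocator, region.size);
          iommu_map(iommu, &region);      /* mappings belong to the iommu */
          iommu_unmap(iommu, &region);

          iova_allocator_cleanup(allocator);
          vfio_pci_device_cleanup(device);
          iommu_cleanup(iommu);           /* last: the device referenced it */
  }
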
diff --git a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c
index 3655106b912d..caf1c6291f3d 100644
--- a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c
+++ b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c
@@ -10,7 +10,7 @@
#include <sys/ioctl.h>
#include <unistd.h>
-#include <vfio_util.h>
+#include <libvfio.h>
#include "../kselftest_harness.h"
static const char iommu_dev_path[] = "/dev/iommu";
diff --git a/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c b/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c
new file mode 100644
index 000000000000..33b0c31fe2ed
--- /dev/null
+++ b/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <pthread.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/sizes.h>
+#include <linux/time64.h>
+#include <linux/vfio.h>
+
+#include <libvfio.h>
+
+#include "../kselftest_harness.h"
+
+static char **device_bdfs;
+static int nr_devices;
+
+struct thread_args {
+ struct iommu *iommu;
+ int device_index;
+ struct timespec start;
+ struct timespec end;
+ pthread_barrier_t *barrier;
+};
+
+FIXTURE(vfio_pci_device_init_perf_test) {
+ pthread_t *threads;
+ pthread_barrier_t barrier;
+ struct thread_args *thread_args;
+ struct iommu *iommu;
+};
+
+FIXTURE_VARIANT(vfio_pci_device_init_perf_test) {
+ const char *iommu_mode;
+};
+
+#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \
+FIXTURE_VARIANT_ADD(vfio_pci_device_init_perf_test, _iommu_mode) { \
+ .iommu_mode = #_iommu_mode, \
+}
+
+FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES();
+
+FIXTURE_SETUP(vfio_pci_device_init_perf_test)
+{
+ int i;
+
+ self->iommu = iommu_init(variant->iommu_mode);
+ self->threads = calloc(nr_devices, sizeof(self->threads[0]));
+ self->thread_args = calloc(nr_devices, sizeof(self->thread_args[0]));
+
+ pthread_barrier_init(&self->barrier, NULL, nr_devices);
+
+ for (i = 0; i < nr_devices; i++) {
+ self->thread_args[i].iommu = self->iommu;
+ self->thread_args[i].barrier = &self->barrier;
+ self->thread_args[i].device_index = i;
+ }
+}
+
+FIXTURE_TEARDOWN(vfio_pci_device_init_perf_test)
+{
+	pthread_barrier_destroy(&self->barrier);
+	iommu_cleanup(self->iommu);
+	free(self->threads);
+	free(self->thread_args);
+}
+
+static s64 to_ns(struct timespec ts)
+{
+ return (s64)ts.tv_nsec + NSEC_PER_SEC * (s64)ts.tv_sec;
+}
+
+static struct timespec to_timespec(s64 ns)
+{
+ struct timespec ts = {
+ .tv_nsec = ns % NSEC_PER_SEC,
+ .tv_sec = ns / NSEC_PER_SEC,
+ };
+
+ return ts;
+}
+
+static struct timespec timespec_sub(struct timespec a, struct timespec b)
+{
+ return to_timespec(to_ns(a) - to_ns(b));
+}
+
+static struct timespec timespec_min(struct timespec a, struct timespec b)
+{
+ return to_ns(a) < to_ns(b) ? a : b;
+}
+
+static struct timespec timespec_max(struct timespec a, struct timespec b)
+{
+ return to_ns(a) > to_ns(b) ? a : b;
+}
+
+static void *thread_main(void *__args)
+{
+ struct thread_args *args = __args;
+ struct vfio_pci_device *device;
+
+ pthread_barrier_wait(args->barrier);
+
+ clock_gettime(CLOCK_MONOTONIC, &args->start);
+ device = vfio_pci_device_init(device_bdfs[args->device_index], args->iommu);
+ clock_gettime(CLOCK_MONOTONIC, &args->end);
+
+ pthread_barrier_wait(args->barrier);
+
+ vfio_pci_device_cleanup(device);
+ return NULL;
+}
+
+TEST_F(vfio_pci_device_init_perf_test, init)
+{
+ struct timespec start = to_timespec(INT64_MAX), end = {};
+ struct timespec min = to_timespec(INT64_MAX);
+ struct timespec max = {};
+ struct timespec avg = {};
+ struct timespec wall_time;
+ s64 thread_ns = 0;
+ int i;
+
+ for (i = 0; i < nr_devices; i++) {
+ pthread_create(&self->threads[i], NULL, thread_main,
+ &self->thread_args[i]);
+ }
+
+ for (i = 0; i < nr_devices; i++) {
+ struct thread_args *args = &self->thread_args[i];
+ struct timespec init_time;
+
+ pthread_join(self->threads[i], NULL);
+
+ start = timespec_min(start, args->start);
+ end = timespec_max(end, args->end);
+
+ init_time = timespec_sub(args->end, args->start);
+ min = timespec_min(min, init_time);
+ max = timespec_max(max, init_time);
+ thread_ns += to_ns(init_time);
+ }
+
+ avg = to_timespec(thread_ns / nr_devices);
+ wall_time = timespec_sub(end, start);
+
+ printf("Wall time: %lu.%09lus\n",
+ wall_time.tv_sec, wall_time.tv_nsec);
+ printf("Min init time (per device): %lu.%09lus\n",
+ min.tv_sec, min.tv_nsec);
+ printf("Max init time (per device): %lu.%09lus\n",
+ max.tv_sec, max.tv_nsec);
+ printf("Avg init time (per device): %lu.%09lus\n",
+ avg.tv_sec, avg.tv_nsec);
+}
+
+int main(int argc, char *argv[])
+{
+ int i;
+
+ device_bdfs = vfio_selftests_get_bdfs(&argc, argv, &nr_devices);
+
+ printf("Testing parallel initialization of %d devices:\n", nr_devices);
+ for (i = 0; i < nr_devices; i++)
+ printf(" %s\n", device_bdfs[i]);
+
+ return test_harness_run(argc, argv);
+}
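
One way to read the printed numbers: comparing the summed per-thread init time against the wall time gives a crude parallelism figure. A hypothetical helper (not part of the patch) that makes the arithmetic explicit:

  /* 1.0 means the per-device init times overlapped perfectly; a value
   * near 1/nr_devices means initialization was effectively serialized. */
  static double parallel_efficiency(s64 thread_ns, s64 wall_ns, int nr_devices)
  {
          return (double)thread_ns / ((double)wall_ns * (double)nr_devices);
  }
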
diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c
index 7a270698e4d2..ecbb669b3765 100644
--- a/tools/testing/selftests/vfio/vfio_pci_device_test.c
+++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c
@@ -10,7 +10,7 @@
#include <linux/sizes.h>
#include <linux/vfio.h>
-#include <vfio_util.h>
+#include <libvfio.h>
#include "../kselftest_harness.h"
@@ -23,17 +23,20 @@ static const char *device_bdf;
#define MAX_TEST_MSI 16U
FIXTURE(vfio_pci_device_test) {
+ struct iommu *iommu;
struct vfio_pci_device *device;
};
FIXTURE_SETUP(vfio_pci_device_test)
{
- self->device = vfio_pci_device_init(device_bdf, default_iommu_mode);
+ self->iommu = iommu_init(default_iommu_mode);
+ self->device = vfio_pci_device_init(device_bdf, self->iommu);
}
FIXTURE_TEARDOWN(vfio_pci_device_test)
{
vfio_pci_device_cleanup(self->device);
+ iommu_cleanup(self->iommu);
}
#define read_pci_id_from_sysfs(_file) ({ \
@@ -99,6 +102,7 @@ TEST_F(vfio_pci_device_test, validate_bars)
}
FIXTURE(vfio_pci_irq_test) {
+ struct iommu *iommu;
struct vfio_pci_device *device;
};
@@ -116,12 +120,14 @@ FIXTURE_VARIANT_ADD(vfio_pci_irq_test, msix) {
FIXTURE_SETUP(vfio_pci_irq_test)
{
- self->device = vfio_pci_device_init(device_bdf, default_iommu_mode);
+ self->iommu = iommu_init(default_iommu_mode);
+ self->device = vfio_pci_device_init(device_bdf, self->iommu);
}
FIXTURE_TEARDOWN(vfio_pci_irq_test)
{
vfio_pci_device_cleanup(self->device);
+ iommu_cleanup(self->iommu);
}
TEST_F(vfio_pci_irq_test, enable_trigger_disable)
diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c
index f69eec8b928d..f0ca8310d6a8 100644
--- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c
+++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c
@@ -5,7 +5,7 @@
#include <linux/sizes.h>
#include <linux/vfio.h>
-#include <vfio_util.h>
+#include <libvfio.h>
#include "../kselftest_harness.h"
@@ -18,9 +18,9 @@ static const char *device_bdf;
ASSERT_EQ(EAGAIN, errno); \
} while (0)
-static void region_setup(struct vfio_pci_device *device,
+static void region_setup(struct iommu *iommu,
struct iova_allocator *iova_allocator,
- struct vfio_dma_region *region, u64 size)
+ struct dma_region *region, u64 size)
{
const int flags = MAP_SHARED | MAP_ANONYMOUS;
const int prot = PROT_READ | PROT_WRITE;
@@ -33,20 +33,20 @@ static void region_setup(struct vfio_pci_device *device,
region->iova = iova_allocator_alloc(iova_allocator, size);
region->size = size;
- vfio_pci_dma_map(device, region);
+ iommu_map(iommu, region);
}
-static void region_teardown(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
+static void region_teardown(struct iommu *iommu, struct dma_region *region)
{
- vfio_pci_dma_unmap(device, region);
+ iommu_unmap(iommu, region);
VFIO_ASSERT_EQ(munmap(region->vaddr, region->size), 0);
}
FIXTURE(vfio_pci_driver_test) {
+ struct iommu *iommu;
struct vfio_pci_device *device;
struct iova_allocator *iova_allocator;
- struct vfio_dma_region memcpy_region;
+ struct dma_region memcpy_region;
void *vaddr;
int msi_fd;
@@ -73,13 +73,14 @@ FIXTURE_SETUP(vfio_pci_driver_test)
{
struct vfio_pci_driver *driver;
- self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
- self->iova_allocator = iova_allocator_init(self->device);
+ self->iommu = iommu_init(variant->iommu_mode);
+ self->device = vfio_pci_device_init(device_bdf, self->iommu);
+ self->iova_allocator = iova_allocator_init(self->iommu);
driver = &self->device->driver;
- region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G);
- region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M);
+ region_setup(self->iommu, self->iova_allocator, &self->memcpy_region, SZ_1G);
+ region_setup(self->iommu, self->iova_allocator, &driver->region, SZ_2M);
/* Any IOVA that doesn't overlap memcpy_region and driver->region. */
self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G);
@@ -108,11 +109,12 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test)
vfio_pci_driver_remove(self->device);
- region_teardown(self->device, &self->memcpy_region);
- region_teardown(self->device, &driver->region);
+ region_teardown(self->iommu, &self->memcpy_region);
+ region_teardown(self->iommu, &driver->region);
iova_allocator_cleanup(self->iova_allocator);
vfio_pci_device_cleanup(self->device);
+ iommu_cleanup(self->iommu);
}
TEST_F(vfio_pci_driver_test, init_remove)
@@ -231,18 +233,31 @@ TEST_F_TIMEOUT(vfio_pci_driver_test, memcpy_storm, 60)
ASSERT_NO_MSI(self->msi_fd);
}
-int main(int argc, char *argv[])
+static bool device_has_selftests_driver(const char *bdf)
{
struct vfio_pci_device *device;
+ struct iommu *iommu;
+ bool has_driver;
+
+ iommu = iommu_init(default_iommu_mode);
+	device = vfio_pci_device_init(bdf, iommu);
+
+ has_driver = !!device->driver.ops;
+
+ vfio_pci_device_cleanup(device);
+ iommu_cleanup(iommu);
+ return has_driver;
+}
+
+int main(int argc, char *argv[])
+{
device_bdf = vfio_selftests_get_bdf(&argc, argv);
- device = vfio_pci_device_init(device_bdf, default_iommu_mode);
- if (!device->driver.ops) {
+ if (!device_has_selftests_driver(device_bdf)) {
fprintf(stderr, "No driver found for device %s\n", device_bdf);
return KSFT_SKIP;
}
- vfio_pci_device_cleanup(device);
return test_harness_run(argc, argv);
}