From 59079438a664559bb1f6f5fe85e306962ef9286e Mon Sep 17 00:00:00 2001 From: Mikhael Goikhman Date: Wed, 10 Mar 2021 23:09:09 -0800 Subject: net/mlx5: Remove unused mlx5_core_health member recover_work The code related to health->recover_work was removed in commit 63cbc552eebf ("net/mlx5: Handle SW reset of FW in error flow") Fix struct mlx5_core_health accordingly. Signed-off-by: Mikhael Goikhman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 53b89631a1d9..8fe51b4a781e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -438,7 +438,6 @@ struct mlx5_core_health { unsigned long flags; struct work_struct fatal_report_work; struct work_struct report_work; - struct delayed_work recover_work; struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_fatal_reporter; }; -- cgit v1.2.3 From 59c904c8fffd903c1dae5fc6a402b88fa6dfc874 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 10 Mar 2021 23:09:11 -0800 Subject: net/mlx5: E-Switch, Add eswitch pointer to each representor Store the managing E-Switch of each representor. This will be used when a representor is created on eswitch manager 0 but the vport belongs to eswitch manager 1. Signed-off-by: Mark Bloch Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 1 + include/linux/mlx5/eswitch.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index fd5f8b830584..f6c0e7e05ad5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3153,6 +3153,7 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw, esw->offloads.rep_ops[rep_type] = ops; mlx5_esw_for_all_reps(esw, i, rep) { if (likely(mlx5_eswitch_vport_has_rep(esw, i))) { + rep->esw = esw; rep_data = &rep->rep_data[rep_type]; atomic_set(&rep_data->state, REP_REGISTERED); } diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 994c2c8cb4fd..72d480df2a03 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -48,6 +48,7 @@ struct mlx5_eswitch_rep { /* Only IB rep is using vport_index */ u16 vport_index; u32 vlan_refcount; + struct mlx5_eswitch *esw; }; void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw, -- cgit v1.2.3 From 3a46f4fb55ffd46e475e3fc53b1252f722cf647e Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 10 Mar 2021 23:09:13 -0800 Subject: net/mlx5: E-Switch, Refactor send to vport to be more generic Now that each representor stores a pointer to the managing E-Switch use that information when creating the send-to-vport rules. Signed-off-by: Mark Bloch Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/ib_rep.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 3 +-- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 20 ++++++++++++-------- include/linux/mlx5/eswitch.h | 4 ++-- 4 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 4eae7131b0ce..db5de720bb12 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -123,8 +123,7 @@ struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, rep = dev->port[port - 1].rep; - return mlx5_eswitch_add_send_to_vport_rule(esw, rep->vport, - sq->base.mqp.qpn); + return mlx5_eswitch_add_send_to_vport_rule(esw, rep, sq->base.mqp.qpn); } static int mlx5r_rep_probe(struct auxiliary_device *adev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index a132fff7a980..3d6c2bce67d2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -411,8 +411,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw, } /* Add re-inject rule to the PF/representor sqs */ - flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, - rep->vport, + flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, rep, sqns_array[i]); if (IS_ERR(flow_rule)) { err = PTR_ERR(flow_rule); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index f6c0e7e05ad5..6090b2609089 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1036,7 +1036,8 @@ out: } struct mlx5_flow_handle * -mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, u16 vport, +mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, + struct mlx5_eswitch_rep *rep, u32 sqn) { struct mlx5_flow_act flow_act = {0}; @@ -1054,27 +1055,30 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, u16 vport, misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn); /* source vport is the esw manager */ - MLX5_SET(fte_match_set_misc, misc, source_port, esw->manager_vport); - if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) + MLX5_SET(fte_match_set_misc, misc, source_port, rep->esw->manager_vport); + if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch)) MLX5_SET(fte_match_set_misc, misc, source_eswitch_owner_vhca_id, - MLX5_CAP_GEN(esw->dev, vhca_id)); + MLX5_CAP_GEN(rep->esw->dev, vhca_id)); misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn); MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); - if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) + if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch)) MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_eswitch_owner_vhca_id); spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; - dest.vport.num = vport; + dest.vport.num = rep->vport; + dest.vport.vhca_id = MLX5_CAP_GEN(rep->esw->dev, vhca_id); + dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + flow_rule = mlx5_add_flow_rules(on_esw->fdb_table.offloads.slow_fdb, spec, &flow_act, &dest, 1); if (IS_ERR(flow_rule)) - esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule)); + esw_warn(on_esw->dev, "FDB: Failed to add send to vport rule err %ld\n", + PTR_ERR(flow_rule)); out: kvfree(spec); return flow_rule; diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 72d480df2a03..2ec0527991c8 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -62,8 +62,8 @@ struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, u16 vport_num); void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type); struct mlx5_flow_handle * -mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, - u16 vport_num, u32 sqn); +mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, + struct mlx5_eswitch_rep *rep, u32 sqn); u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); -- cgit v1.2.3 From c3e666f1ada9cbfbe5465f122f9a2d63ddfd25ed Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 10 Mar 2021 23:09:14 -0800 Subject: net/mlx5: Add IFC bits needed for single FDB mode Currently we operate in a mode where each eswitch manager has a separate FDB. In order to combine these multiple FDBs we expose new caps to allow this: - Set root flow table which isn't native. - Set FDB a different selection mode when in LAG mode. Signed-off-by: Mark Bloch Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index df5d91c8b2d4..3ee7a86f39e4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -806,9 +806,11 @@ struct mlx5_ifc_e_switch_cap_bits { u8 vport_svlan_insert[0x1]; u8 vport_cvlan_insert_if_not_exist[0x1]; u8 vport_cvlan_insert_overwrite[0x1]; - u8 reserved_at_5[0x3]; + u8 reserved_at_5[0x2]; + u8 esw_shared_ingress_acl[0x1]; u8 esw_uplink_ingress_acl[0x1]; - u8 reserved_at_9[0x10]; + u8 root_ft_on_other_esw[0x1]; + u8 reserved_at_a[0xf]; u8 esw_functions_changed[0x1]; u8 reserved_at_1a[0x1]; u8 ecpf_vport_exists[0x1]; @@ -1502,7 +1504,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_270[0x6]; u8 lag_dct[0x2]; u8 lag_tx_port_affinity[0x1]; - u8 reserved_at_279[0x2]; + u8 lag_native_fdb_selection[0x1]; + u8 reserved_at_27a[0x1]; u8 lag_master[0x1]; u8 num_lag_ports[0x4]; @@ -10036,14 +10039,19 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x7]; + u8 table_of_other_vport[0x1]; + u8 table_vport_number[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; u8 reserved_at_c0[0x8]; u8 underlay_qpn[0x18]; - u8 reserved_at_e0[0x120]; + u8 table_eswitch_owner_vhca_id_valid[0x1]; + u8 reserved_at_e1[0xf]; + u8 table_eswitch_owner_vhca_id[0x10]; + u8 reserved_at_100[0x100]; }; enum { @@ -10273,7 +10281,8 @@ struct mlx5_ifc_dcbx_param_bits { }; struct mlx5_ifc_lagc_bits { - u8 reserved_at_0[0x1d]; + u8 fdb_selection_mode[0x1]; + u8 reserved_at_1[0x1c]; u8 lag_state[0x3]; u8 reserved_at_20[0x14]; -- cgit v1.2.3 From 26bf30902c10473ba38f220d3401a61c56d8db3b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 10 Mar 2021 23:09:15 -0800 Subject: net/mlx5: Use order-0 allocations for EQs Currently we are allocating high-order page for EQs. In case of fragmented system, VF hot remove/add in VMs for example, there isn't enough contiguous memory for EQs allocation, which results in crashing of the VM. Therefore, use order-0 fragments for the EQ allocations instead. Performance tests: ConnectX-5 100Gbps, CPU: Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz Performance tests show no sensible degradation. Signed-off-by: Tariq Toukan Signed-off-by: Shay Drory Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/health.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 27 +++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h | 15 ++++++++---- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 5 ---- include/linux/mlx5/driver.h | 5 ++++ 5 files changed, 32 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c index 84e501e057b4..6f4e6c34b2a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c @@ -128,7 +128,7 @@ int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg if (err) return err; - err = devlink_fmsg_u32_pair_put(fmsg, "size", eq->core.nent); + err = devlink_fmsg_u32_pair_put(fmsg, "size", eq_get_size(&eq->core)); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 174dfbc996c6..4e8381030d77 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -271,7 +271,7 @@ static void init_eq_buf(struct mlx5_eq *eq) struct mlx5_eqe *eqe; int i; - for (i = 0; i < eq->nent; i++) { + for (i = 0; i < eq_get_size(eq); i++) { eqe = get_eqe(eq, i); eqe->owner = MLX5_EQE_OWNER_INIT_VAL; } @@ -281,8 +281,10 @@ static int create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, struct mlx5_eq_param *param) { + u8 log_eq_size = order_base_2(param->nent + MLX5_NUM_SPARE_EQE); struct mlx5_cq_table *cq_table = &eq->cq_table; u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0}; + u8 log_eq_stride = ilog2(MLX5_EQE_SIZE); struct mlx5_priv *priv = &dev->priv; u8 vecidx = param->irq_index; __be64 *pas; @@ -297,16 +299,18 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, spin_lock_init(&cq_table->lock); INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); - eq->nent = roundup_pow_of_two(param->nent + MLX5_NUM_SPARE_EQE); eq->cons_index = 0; - err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf); + + err = mlx5_frag_buf_alloc_node(dev, wq_get_byte_sz(log_eq_size, log_eq_stride), + &eq->frag_buf, dev->priv.numa_node); if (err) return err; + mlx5_init_fbc(eq->frag_buf.frags, log_eq_stride, log_eq_size, &eq->fbc); init_eq_buf(eq); inlen = MLX5_ST_SZ_BYTES(create_eq_in) + - MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->buf.npages; + MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->frag_buf.npages; in = kvzalloc(inlen, GFP_KERNEL); if (!in) { @@ -315,7 +319,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, } pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas); - mlx5_fill_page_array(&eq->buf, pas); + mlx5_fill_page_frag_array(&eq->frag_buf, pas); MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ); if (!param->mask[0] && MLX5_CAP_GEN(dev, log_max_uctx)) @@ -326,11 +330,11 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, param->mask[i]); eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry); - MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent)); + MLX5_SET(eqc, eqc, log_eq_size, eq->fbc.log_sz); MLX5_SET(eqc, eqc, uar_page, priv->uar->index); MLX5_SET(eqc, eqc, intr, vecidx); MLX5_SET(eqc, eqc, log_page_size, - eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + eq->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (err) @@ -356,7 +360,7 @@ err_in: kvfree(in); err_buf: - mlx5_buf_free(dev, &eq->buf); + mlx5_frag_buf_free(dev, &eq->frag_buf); return err; } @@ -413,7 +417,7 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) eq->eqn); synchronize_irq(eq->irqn); - mlx5_buf_free(dev, &eq->buf); + mlx5_frag_buf_free(dev, &eq->frag_buf); return err; } @@ -764,10 +768,11 @@ EXPORT_SYMBOL(mlx5_eq_destroy_generic); struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, u32 cc) { u32 ci = eq->cons_index + cc; + u32 nent = eq_get_size(eq); struct mlx5_eqe *eqe; - eqe = get_eqe(eq, ci & (eq->nent - 1)); - eqe = ((eqe->owner & 1) ^ !!(ci & eq->nent)) ? NULL : eqe; + eqe = get_eqe(eq, ci & (nent - 1)); + eqe = ((eqe->owner & 1) ^ !!(ci & nent)) ? NULL : eqe; /* Make sure we read EQ entry contents after we've * checked the ownership bit. */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h index 81f2cc4ca1da..f607a3858ef5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h @@ -22,15 +22,15 @@ struct mlx5_cq_table { }; struct mlx5_eq { + struct mlx5_frag_buf_ctrl fbc; + struct mlx5_frag_buf frag_buf; struct mlx5_core_dev *dev; struct mlx5_cq_table cq_table; __be32 __iomem *doorbell; u32 cons_index; - struct mlx5_frag_buf buf; unsigned int vecidx; unsigned int irqn; u8 eqn; - int nent; struct mlx5_rsc_debug *dbg; }; @@ -47,16 +47,21 @@ struct mlx5_eq_comp { struct list_head list; }; +static inline u32 eq_get_size(struct mlx5_eq *eq) +{ + return eq->fbc.sz_m1 + 1; +} + static inline struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry) { - return mlx5_buf_offset(&eq->buf, entry * MLX5_EQE_SIZE); + return mlx5_frag_buf_get_wqe(&eq->fbc, entry); } static inline struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq) { - struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & (eq->nent - 1)); + struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & eq->fbc.sz_m1); - return ((eqe->owner & 1) ^ !!(eq->cons_index & eq->nent)) ? NULL : eqe; + return (eqe->owner ^ (eq->cons_index >> eq->fbc.log_sz)) & 1 ? NULL : eqe; } static inline void eq_update_ci(struct mlx5_eq *eq, int arm) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index 01f075fac276..3091dd014650 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -34,11 +34,6 @@ #include "wq.h" #include "mlx5_core.h" -static u32 wq_get_byte_sz(u8 log_sz, u8 log_stride) -{ - return ((u32)1 << log_sz) << log_stride; -} - int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *wqc, struct mlx5_wq_cyc *wq, struct mlx5_wq_ctrl *wq_ctrl) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8fe51b4a781e..5c0422930b01 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -873,6 +873,11 @@ static inline u32 mlx5_base_mkey(const u32 key) return key & 0xffffff00u; } +static inline u32 wq_get_byte_sz(u8 log_sz, u8 log_stride) +{ + return ((u32)1 << log_sz) << log_stride; +} + static inline void mlx5_init_fbc_offset(struct mlx5_buf_list *frags, u8 log_stride, u8 log_sz, u16 strides_offset, -- cgit v1.2.3 From c3d5c2d96d69f2578d6fbf66e39cc2cf840d9812 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Apr 2021 10:22:18 +0300 Subject: PCI/IOV: Add sysfs MSI-X vector assignment interface A typical cloud provider SR-IOV use case is to create many VFs for use by guest VMs. The VFs may not be assigned to a VM until a customer requests a VM of a certain size, e.g., number of CPUs. A VF may need MSI-X vectors proportional to the number of CPUs in the VM, but there is no standard way to change the number of MSI-X vectors supported by a VF. Some Mellanox ConnectX devices support dynamic assignment of MSI-X vectors to SR-IOV VFs. This can be done by the PF driver after VFs are enabled, and it can be done without affecting VFs that are already in use. The hardware supports a limited pool of MSI-X vectors that can be assigned to the PF or to individual VFs. This is device-specific behavior that requires support in the PF driver. Add a read-only "sriov_vf_total_msix" sysfs file for the PF and a writable "sriov_vf_msix_count" file for each VF. Management software may use these to learn how many MSI-X vectors are available and to dynamically assign them to VFs before the VFs are passed through to a VM. If the PF driver implements the ->sriov_get_vf_total_msix() callback, "sriov_vf_total_msix" contains the total number of MSI-X vectors available for distribution among VFs. If no driver is bound to the VF, writing "N" to "sriov_vf_msix_count" uses the PF driver ->sriov_set_msix_vec_count() callback to assign "N" MSI-X vectors to the VF. When a VF driver subsequently reads the MSI-X Message Control register, it will see the new Table Size "N". Link: https://lore.kernel.org/linux-pci/20210314124256.70253-2-leon@kernel.org Acked-by: Bjorn Helgaas Signed-off-by: Leon Romanovsky --- Documentation/ABI/testing/sysfs-bus-pci | 29 +++++++++ drivers/pci/iov.c | 102 ++++++++++++++++++++++++++++++-- drivers/pci/pci-sysfs.c | 3 +- drivers/pci/pci.h | 3 +- include/linux/pci.h | 8 +++ 5 files changed, 137 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 25c9c39770c6..e5cfd170b491 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -375,3 +375,32 @@ Description: The value comes from the PCI kernel device state and can be one of: "unknown", "error", "D0", D1", "D2", "D3hot", "D3cold". The file is read only. + +What: /sys/bus/pci/devices/.../sriov_vf_total_msix +Date: January 2021 +Contact: Leon Romanovsky +Description: + This file is associated with a SR-IOV physical function (PF). + It contains the total number of MSI-X vectors available for + assignment to all virtual functions (VFs) associated with PF. + The value will be zero if the device doesn't support this + functionality. For supported devices, the value will be + constant and won't be changed after MSI-X vectors assignment. + +What: /sys/bus/pci/devices/.../sriov_vf_msix_count +Date: January 2021 +Contact: Leon Romanovsky +Description: + This file is associated with a SR-IOV virtual function (VF). + It allows configuration of the number of MSI-X vectors for + the VF. This allows devices that have a global pool of MSI-X + vectors to optimally divide them between VFs based on VF usage. + + The values accepted are: + * > 0 - this number will be reported as the Table Size in the + VF's MSI-X capability + * < 0 - not valid + * = 0 - will reset to the device default value + + The file is writable if the PF is bound to a driver that + implements ->sriov_set_msix_vec_count(). diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 4afd4ee4f7f0..afc06e6ce115 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -31,6 +31,7 @@ int pci_iov_virtfn_devfn(struct pci_dev *dev, int vf_id) return (dev->devfn + dev->sriov->offset + dev->sriov->stride * vf_id) & 0xff; } +EXPORT_SYMBOL_GPL(pci_iov_virtfn_devfn); /* * Per SR-IOV spec sec 3.3.10 and 3.3.11, First VF Offset and VF Stride may @@ -157,6 +158,92 @@ failed: return rc; } +#ifdef CONFIG_PCI_MSI +static ssize_t sriov_vf_total_msix_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + u32 vf_total_msix = 0; + + device_lock(dev); + if (!pdev->driver || !pdev->driver->sriov_get_vf_total_msix) + goto unlock; + + vf_total_msix = pdev->driver->sriov_get_vf_total_msix(pdev); +unlock: + device_unlock(dev); + return sysfs_emit(buf, "%u\n", vf_total_msix); +} +static DEVICE_ATTR_RO(sriov_vf_total_msix); + +static ssize_t sriov_vf_msix_count_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *vf_dev = to_pci_dev(dev); + struct pci_dev *pdev = pci_physfn(vf_dev); + int val, ret; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0) + return -EINVAL; + + device_lock(&pdev->dev); + if (!pdev->driver || !pdev->driver->sriov_set_msix_vec_count) { + ret = -EOPNOTSUPP; + goto err_pdev; + } + + device_lock(&vf_dev->dev); + if (vf_dev->driver) { + /* + * A driver is already attached to this VF and has configured + * itself based on the current MSI-X vector count. Changing + * the vector size could mess up the driver, so block it. + */ + ret = -EBUSY; + goto err_dev; + } + + ret = pdev->driver->sriov_set_msix_vec_count(vf_dev, val); + +err_dev: + device_unlock(&vf_dev->dev); +err_pdev: + device_unlock(&pdev->dev); + return ret ? : count; +} +static DEVICE_ATTR_WO(sriov_vf_msix_count); +#endif + +static struct attribute *sriov_vf_dev_attrs[] = { +#ifdef CONFIG_PCI_MSI + &dev_attr_sriov_vf_msix_count.attr, +#endif + NULL, +}; + +static umode_t sriov_vf_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pdev->is_virtfn) + return 0; + + return a->mode; +} + +const struct attribute_group sriov_vf_dev_attr_group = { + .attrs = sriov_vf_dev_attrs, + .is_visible = sriov_vf_attrs_are_visible, +}; + int pci_iov_add_virtfn(struct pci_dev *dev, int id) { int i; @@ -400,18 +487,21 @@ static DEVICE_ATTR_RO(sriov_stride); static DEVICE_ATTR_RO(sriov_vf_device); static DEVICE_ATTR_RW(sriov_drivers_autoprobe); -static struct attribute *sriov_dev_attrs[] = { +static struct attribute *sriov_pf_dev_attrs[] = { &dev_attr_sriov_totalvfs.attr, &dev_attr_sriov_numvfs.attr, &dev_attr_sriov_offset.attr, &dev_attr_sriov_stride.attr, &dev_attr_sriov_vf_device.attr, &dev_attr_sriov_drivers_autoprobe.attr, +#ifdef CONFIG_PCI_MSI + &dev_attr_sriov_vf_total_msix.attr, +#endif NULL, }; -static umode_t sriov_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) +static umode_t sriov_pf_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); @@ -421,9 +511,9 @@ static umode_t sriov_attrs_are_visible(struct kobject *kobj, return a->mode; } -const struct attribute_group sriov_dev_attr_group = { - .attrs = sriov_dev_attrs, - .is_visible = sriov_attrs_are_visible, +const struct attribute_group sriov_pf_dev_attr_group = { + .attrs = sriov_pf_dev_attrs, + .is_visible = sriov_pf_attrs_are_visible, }; int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index f8afd54ca3e1..a6b8fbbba6d2 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1567,7 +1567,8 @@ static const struct attribute_group *pci_dev_attr_groups[] = { &pci_dev_attr_group, &pci_dev_hp_attr_group, #ifdef CONFIG_PCI_IOV - &sriov_dev_attr_group, + &sriov_pf_dev_attr_group, + &sriov_vf_dev_attr_group, #endif &pci_bridge_attr_group, &pcie_dev_attr_group, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ef7c4661314f..afb87b917f07 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -501,7 +501,8 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); -extern const struct attribute_group sriov_dev_attr_group; +extern const struct attribute_group sriov_pf_dev_attr_group; +extern const struct attribute_group sriov_vf_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..9b575a676888 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -856,6 +856,12 @@ struct module; * e.g. drivers/net/e100.c. * @sriov_configure: Optional driver callback to allow configuration of * number of VFs to enable via sysfs "sriov_numvfs" file. + * @sriov_set_msix_vec_count: PF Driver callback to change number of MSI-X + * vectors on a VF. Triggered via sysfs "sriov_vf_msix_count". + * This will change MSI-X Table Size in the VF Message Control + * registers. + * @sriov_get_vf_total_msix: PF driver callback to get the total number of + * MSI-X vectors available for distribution to the VFs. * @err_handler: See Documentation/PCI/pci-error-recovery.rst * @groups: Sysfs attribute groups. * @driver: Driver model structure. @@ -871,6 +877,8 @@ struct pci_driver { int (*resume)(struct pci_dev *dev); /* Device woken up */ void (*shutdown)(struct pci_dev *dev); int (*sriov_configure)(struct pci_dev *dev, int num_vfs); /* On PF */ + int (*sriov_set_msix_vec_count)(struct pci_dev *vf, int msix_vec_count); /* On PF */ + u32 (*sriov_get_vf_total_msix)(struct pci_dev *pf); const struct pci_error_handlers *err_handler; const struct attribute_group **groups; struct device_driver driver; -- cgit v1.2.3 From 0b989c1e37053196676b2238f82195bd5a339d58 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 14 Mar 2021 14:42:54 +0200 Subject: net/mlx5: Add dynamic MSI-X capabilities bits These new fields declare the number of MSI-X vectors that is possible to allocate on the VF through PF configuration. Value must be in range defined by min_dynamic_vf_msix_table_size and max_dynamic_vf_msix_table_size. The driver should continue to query its MSI-X table through PCI configuration header. Link: https://lore.kernel.org/linux-pci/20210314124256.70253-3-leon@kernel.org Acked-by: Bjorn Helgaas Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3ee7a86f39e4..432290b58a0b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1683,7 +1683,16 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_6e0[0x10]; u8 sf_base_id[0x10]; - u8 reserved_at_700[0x80]; + u8 reserved_at_700[0x8]; + u8 num_total_dynamic_vf_msix[0x18]; + u8 reserved_at_720[0x14]; + u8 dynamic_msix_table_size[0xc]; + u8 reserved_at_740[0xc]; + u8 min_dynamic_vf_msix_table_size[0x4]; + u8 reserved_at_750[0x4]; + u8 max_dynamic_vf_msix_table_size[0xc]; + + u8 reserved_at_760[0x20]; u8 vhca_tunnel_commands[0x40]; u8 reserved_at_7c0[0x40]; }; -- cgit v1.2.3