// SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2025 Arm Ltd. #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mpam_internal.h" DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); /* * The classes we've picked to map to resctrl resources, wrapped * in with their resctrl structure. * Class pointer may be NULL. */ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; #define for_each_mpam_resctrl_control(res, rid) \ for (rid = 0, res = &mpam_resctrl_controls[rid]; \ rid < RDT_NUM_RESOURCES; \ rid++, res = &mpam_resctrl_controls[rid]) /* * The classes we've picked to map to resctrl events. * Resctrl believes all the worlds a Xeon, and these are all on the L3. This * array lets us find the actual class backing the event counters. e.g. * the only memory bandwidth counters may be on the memory controller, but to * make use of them, we pretend they are on L3. Restrict the events considered * to those supported by MPAM. * Class pointer may be NULL. */ #define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; #define for_each_mpam_resctrl_mon(mon, eventid) \ for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \ eventid <= MPAM_MAX_EVENT; \ eventid++, mon = &mpam_resctrl_counters[eventid]) /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); /* * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. * This applies globally to all traffic the CPU generates. */ static bool cdp_enabled; /* * We use cacheinfo to discover the size of the caches and their id. cacheinfo * populates this from a device_initcall(). mpam_resctrl_setup() must wait. 
 */
static bool cacheinfo_ready;
static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready);

/*
 * If resctrl_init() succeeded, resctrl_exit() can be used to remove support
 * for the filesystem in the event of an error.
 */
static bool resctrl_enabled;

/* True if any picked control class exposes an allocation control. */
bool resctrl_arch_alloc_capable(void)
{
	struct mpam_resctrl_res *res;
	enum resctrl_res_level rid;

	for_each_mpam_resctrl_control(res, rid) {
		if (res->resctrl_res.alloc_capable)
			return true;
	}

	return false;
}

bool resctrl_arch_mon_capable(void)
{
	struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
	struct rdt_resource *l3 = &res->resctrl_res;

	/* All monitors are presented as being on the L3 cache */
	return l3->mon_capable;
}

/*
 * Stubs for resctrl arch hooks with no MPAM equivalent: BMEC-style
 * configurable events and ABMC-style counter assignment are x86 features.
 */
bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
{
	return false;
}

void resctrl_arch_mon_event_config_read(void *info)
{
}

void resctrl_arch_mon_event_config_write(void *info)
{
}

void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
{
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
			     u32 closid, u32 rmid, enum resctrl_event_id eventid)
{
}

void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
			     u32 closid, u32 rmid, int cntr_id,
			     enum resctrl_event_id eventid)
{
}

void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
			      u32 cntr_id, bool assign)
{
}

int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
			   u32 unused, u32 rmid, int cntr_id,
			   enum resctrl_event_id eventid, u64 *val)
{
	return -EOPNOTSUPP;
}

bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
{
	return false;
}

int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
{
	return -EINVAL;
}

int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
{
	return -EOPNOTSUPP;
}

bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
{
	return false;
}

void resctrl_arch_pre_mount(void)
{
}

bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid)
{
	return mpam_resctrl_controls[rid].cdp_enabled;
}

/**
 * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks.
 *
 * At boot, all existing tasks use partid zero for D and I.
 * To enable/disable CDP emulation, all these tasks need relabelling.
 */
static void resctrl_reset_task_closids(void)
{
	struct task_struct *p, *t;

	read_lock(&tasklist_lock);
	for_each_process_thread(p, t) {
		resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID,
					     RESCTRL_RESERVED_RMID);
	}
	read_unlock(&tasklist_lock);
}

/*
 * Enable or disable the emulated CDP mode. All tasks and CPU defaults are
 * relabelled to use the reserved closid in the new encoding.
 */
int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable)
{
	u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID;
	struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
	struct rdt_resource *l3 = &res->resctrl_res;
	int cpu;

	if (!IS_ENABLED(CONFIG_EXPERT) && enable) {
		/*
		 * If the resctrl fs is mounted more than once, sequentially,
		 * then CDP can lead to the use of out of range PARTIDs.
		 */
		pr_warn("CDP not supported\n");
		return -EOPNOTSUPP;
	}

	if (enable)
		pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n");

	/*
	 * resctrl_arch_set_cdp_enabled() is only called with enable set to
	 * false on error and unmount.
	 */
	cdp_enabled = enable;
	mpam_resctrl_controls[rid].cdp_enabled = enable;
	/* With CDP each closid consumes two partids, halving the rmid space. */
	if (enable)
		l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2;
	else
		l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();

	/* The mbw_max feature can't hide cdp as it's a per-partid maximum.
	 */
	if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled)
		mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false;
	if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled &&
	    mpam_resctrl_controls[RDT_RESOURCE_MBA].class)
		mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true;

	if (enable) {
		/* NOTE(review): errors past this point leave cdp_enabled set - confirm callers unwind. */
		if (mpam_partid_max < 1)
			return -EINVAL;

		partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA);
		partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE);
	}

	/* Relabel current first so the new global default is CDP-encoded. */
	mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0);
	WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current));

	resctrl_reset_task_closids();

	for_each_possible_cpu(cpu)
		mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0);

	/* Make every online CPU pick up the new default immediately. */
	on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1);

	return 0;
}

/* CDP is on globally, but this resource doesn't support it. */
static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid)
{
	return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid);
}

/*
 * MSC may raise an error interrupt if it sees an out of range partid/pmg,
 * and go on to truncate the value. Regardless of what the hardware supports,
 * only the system wide safe value is safe to use.
 */
u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored)
{
	return mpam_partid_max + 1;
}

u32 resctrl_arch_system_num_rmid_idx(void)
{
	return (mpam_pmg_max + 1) * (mpam_partid_max + 1);
}

/* resctrl's rmid-index is the (closid, pmg) pair flattened into one number. */
u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid)
{
	return closid * (mpam_pmg_max + 1) + rmid;
}

void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
	*closid = idx / (mpam_pmg_max + 1);
	*rmid = idx % (mpam_pmg_max + 1);
}

void resctrl_arch_sched_in(struct task_struct *tsk)
{
	lockdep_assert_preemption_disabled();

	mpam_thread_switch(tsk);
}

void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid)
{
	WARN_ON_ONCE(closid > U16_MAX);
	WARN_ON_ONCE(rmid > U8_MAX);

	if (!cdp_enabled) {
		mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid);
	} else {
		/*
		 * When CDP is enabled, resctrl halves the closid range and we
		 * use odd/even partid for one closid.
		 */
		u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
		u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);

		mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid);
	}
}

/* IPI callback: update this CPU's defaults then reload MPAM0_EL1. */
void resctrl_arch_sync_cpu_closid_rmid(void *info)
{
	struct resctrl_cpu_defaults *r = info;

	lockdep_assert_preemption_disabled();

	if (r) {
		resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(),
							 r->closid, r->rmid);
	}

	resctrl_arch_sched_in(current);
}

void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
{
	WARN_ON_ONCE(closid > U16_MAX);
	WARN_ON_ONCE(rmid > U8_MAX);

	if (!cdp_enabled) {
		mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid);
	} else {
		u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
		u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);

		mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid);
	}
}

bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
{
	u64 regval = mpam_get_regval(tsk);
	u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);

	/* Undo the CDP odd/even partid encoding before comparing. */
	if (cdp_enabled)
		tsk_closid >>= 1;

	return tsk_closid == closid;
}

/* The task's pmg is not
 * unique, the partid must be considered too */
bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
{
	u64 regval = mpam_get_regval(tsk);
	u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
	u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval);

	if (cdp_enabled)
		tsk_closid >>= 1;

	return (tsk_closid == closid) && (tsk_rmid == rmid);
}

struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
{
	if (l >= RDT_NUM_RESOURCES)
		return NULL;

	return &mpam_resctrl_controls[l].resctrl_res;
}

/*
 * Try once to allocate a hardware monitor for @evtid.
 * Returns a monitor index, USE_PRE_ALLOCATED, or a negative errno
 * (-ENOSPC means a monitor may become free later).
 */
static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid)
{
	struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid];

	if (!mpam_is_enabled())
		return -EINVAL;

	if (!mon->class)
		return -EINVAL;

	switch (evtid) {
	case QOS_L3_OCCUP_EVENT_ID:
		/* With CDP, one monitor gets used for both code/data reads */
		return mpam_alloc_csu_mon(mon->class);
	case QOS_L3_MBM_LOCAL_EVENT_ID:
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return USE_PRE_ALLOCATED;
	default:
		return -EOPNOTSUPP;
	}
}

/*
 * Allocate a monitor context, sleeping until one is available.
 * The woken path re-tries the allocation; a signal aborts the wait and the
 * resulting -ENOSPC/-EINTR-era value is returned to the caller via *ret.
 */
void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid)
{
	DEFINE_WAIT(wait);
	int *ret;

	ret = kmalloc_obj(*ret);
	if (!ret)
		return ERR_PTR(-ENOMEM);

	do {
		prepare_to_wait(&resctrl_mon_ctx_waiters, &wait, TASK_INTERRUPTIBLE);
		*ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid);
		if (*ret == -ENOSPC)
			schedule();
	} while (*ret == -ENOSPC && !signal_pending(current));
	finish_wait(&resctrl_mon_ctx_waiters, &wait);

	return ret;
}

static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, u32 mon_idx)
{
	struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid];

	if (!mpam_is_enabled())
		return;

	if (!mon->class)
		return;

	if (evtid == QOS_L3_OCCUP_EVENT_ID)
		mpam_free_csu_mon(mon->class, mon_idx);

	/* A waiter in resctrl_arch_mon_ctx_alloc() may now succeed. */
	wake_up(&resctrl_mon_ctx_waiters);
}

void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid,
			       void *arch_mon_ctx)
{
	u32 mon_idx = *(u32 *)arch_mon_ctx;

	kfree(arch_mon_ctx);
	resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx);
}

/* Read one monitor for one partid/pmg; @cdp_type selects the code/data alias. */
static int
__read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
	   enum mpam_device_features mon_type, int mon_idx,
	   enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val)
{
	struct mon_cfg cfg;

	if (!mpam_is_enabled())
		return -EINVAL;

	/* Shift closid to account for CDP */
	closid = resctrl_get_config_index(closid, cdp_type);

	if (irqs_disabled()) {
		/* Check if we can access this domain without an IPI */
		return -EIO;
	}

	cfg = (struct mon_cfg) {
		.mon = mon_idx,
		.match_pmg = true,
		.partid = closid,
		.pmg = rmid,
	};

	return mpam_msmon_read(mon_comp, &cfg, mon_type, val);
}

/*
 * With CDP the code and data partids are counted separately; sum both
 * before accumulating into *val. Without CDP, read the single partid.
 */
static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon,
			     struct mpam_component *mon_comp,
			     enum mpam_device_features mon_type, int mon_idx,
			     u32 closid, u32 rmid, u64 *val)
{
	if (cdp_enabled) {
		u64 code_val = 0, data_val = 0;
		int err;

		err = __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_CODE,
				 closid, rmid, &code_val);
		if (err)
			return err;

		err = __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_DATA,
				 closid, rmid, &data_val);
		if (err)
			return err;

		/* Accumulate: resctrl sums reads across domains via *val. */
		*val += code_val + data_val;

		return 0;
	}

	return __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_NONE,
			  closid, rmid, val);
}

/* MBWU when not in ABMC mode (not supported), and CSU counters.
*/ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, void *arch_priv, u64 *val, void *arch_mon_ctx) { struct mpam_resctrl_dom *l3_dom; struct mpam_component *mon_comp; u32 mon_idx = *(u32 *)arch_mon_ctx; enum mpam_device_features mon_type; struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; resctrl_arch_rmid_read_context_check(); if (!mpam_is_enabled()) return -EINVAL; if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); mon_comp = l3_dom->mon_comp[eventid]; if (eventid != QOS_L3_OCCUP_EVENT_ID) return -EINVAL; mon_type = mpam_feat_msmon_csu; return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, closid, rmid, val); } /* * The rmid realloc threshold should be for the smallest cache exposed to * resctrl. */ static int update_rmid_limits(struct mpam_class *class) { u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); struct mpam_props *cprops = &class->props; struct cacheinfo *ci; lockdep_assert_cpus_held(); if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) return 0; /* * Assume cache levels are the same size for all CPUs... * The check just requires any online CPU and it can't go offline as we * hold the cpu lock. 
*/ ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); if (!ci || ci->size == 0) { pr_debug("Could not read cache size for class %u\n", class->level); return -EINVAL; } if (!resctrl_rmid_realloc_limit || ci->size < resctrl_rmid_realloc_limit) { resctrl_rmid_realloc_limit = ci->size; resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; } return 0; } static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; if (!mpam_has_feature(mpam_feat_cpor_part, cprops)) return false; /* resctrl uses u32 for all bitmap configurations */ return class->props.cpbm_wd <= 32; } static bool mba_class_use_mbw_max(struct mpam_props *cprops) { return (mpam_has_feature(mpam_feat_mbw_max, cprops) && cprops->bwa_wd); } static bool class_has_usable_mba(struct mpam_props *cprops) { return mba_class_use_mbw_max(cprops); } static bool cache_has_usable_csu(struct mpam_class *class) { struct mpam_props *cprops; if (!class) return false; cprops = &class->props; if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) return false; /* * CSU counters settle on the value, so we can get away with * having only one. */ if (!cprops->num_csu_mon) return false; return true; } /* * Calculate the worst-case percentage change from each implemented step * in the control. */ static u32 get_mba_granularity(struct mpam_props *cprops) { if (!mba_class_use_mbw_max(cprops)) return 0; /* * bwa_wd is the number of bits implemented in the 0.xxx * fixed point fraction. 1 bit is 50%, 2 is 25% etc. */ return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd); } /* * Each fixed-point hardware value architecturally represents a range * of values: the full range 0% - 100% is split contiguously into * (1 << cprops->bwa_wd) equal bands. * * Although the bwa_bwd fields have 6 bits the maximum valid value is 16 * as it reports the width of fields that are at most 16 bits. When * fewer than 16 bits are valid the least significant bits are * ignored. 
 * The implied binary point is kept between bits 15 and 16 and
 * so the valid bits are leftmost.
 *
 * See ARM IHI0099B.a "MPAM system component specification", Section 9.3,
 * "The fixed-point fractional format" for more information.
 *
 * Find the nearest percentage value to the upper bound of the selected band:
 */
static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops)
{
	u32 val = mbw_max;

	/* Keep only the implemented bits, then take the band's upper bound. */
	val >>= 16 - cprops->bwa_wd;
	val += 1;
	val *= MAX_MBA_BW;
	val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd);

	return val;
}

/*
 * Find the band whose upper bound is closest to the specified percentage.
 *
 * A round-to-nearest policy is followed here as a balanced compromise
 * between unexpected under-commit of the resource (where the total of
 * a set of resource allocations after conversion is less than the
 * expected total, due to rounding of the individual converted
 * percentages) and over-commit (where the total of the converted
 * allocations is greater than expected).
 */
static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops)
{
	u32 val = pc;

	val <<= cprops->bwa_wd;
	val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW);
	/* Clamp to band 0 at minimum; bands are upper-bound encoded. */
	val = max(val, 1) - 1;
	val <<= 16 - cprops->bwa_wd;

	return val;
}

/* The smallest advertisable percentage: the upper bound of band zero. */
static u32 get_mba_min(struct mpam_props *cprops)
{
	if (!mba_class_use_mbw_max(cprops)) {
		WARN_ON_ONCE(1);
		return 0;
	}

	return mbw_max_to_percent(0, cprops);
}

/* Find the L3 cache that has affinity with this CPU */
static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask)
{
	u32 cache_id = get_cpu_cacheinfo_id(cpu, 3);

	lockdep_assert_cpus_held();

	return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask);
}

/*
 * topology_matches_l3() - Is the provided class the same shape as L3
 * @victim: The class we'd like to pretend is L3.
 *
 * resctrl expects all the world's a Xeon, and all counters are on the
 * L3. We allow some mapping counters on other classes. This requires
 * that the CPU->domain mapping is the same kind of shape.
 *
 * Using cacheinfo directly would make this work even if resctrl can't
 * use the L3 - but cacheinfo can't tell us anything about offline CPUs.
 * Using the L3 resctrl domain list also depends on CPUs being online.
 * Using the mpam_class we picked for L3 so we can use its domain list
 * assumes that there are MPAM controls on the L3.
 * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id()
 * helper which can tell us about offline CPUs ... but getting the cache_id
 * to start with relies on at least one CPU per L3 cache being online at
 * boot.
 *
 * Walk the victim component list and compare the affinity mask with the
 * corresponding L3. The topology matches if each victim:component's affinity
 * mask is the same as the CPU's corresponding L3's. These lists/masks are
 * computed from firmware tables so don't change at runtime.
 */
static bool topology_matches_l3(struct mpam_class *victim)
{
	int cpu, err;
	struct mpam_component *victim_iter;

	lockdep_assert_cpus_held();

	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL))
		return false;

	guard(srcu)(&mpam_srcu);
	list_for_each_entry_srcu(victim_iter, &victim->components, class_list,
				 srcu_read_lock_held(&mpam_srcu)) {
		if (cpumask_empty(&victim_iter->affinity)) {
			pr_debug("class %u has CPU-less component %u - can't match L3!\n",
				 victim->level, victim_iter->comp_id);
			return false;
		}

		cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask);
		if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
			return false;

		cpumask_clear(tmp_cpumask);
		err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
		if (err) {
			pr_debug("Failed to find L3's equivalent component to class %u component %u\n",
				 victim->level, victim_iter->comp_id);
			return false;
		}

		/* Any differing bits in the affinity mask? */
		if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) {
			pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n"
				 "L3:%*pbl != victim:%*pbl\n",
				 victim->level, victim_iter->comp_id,
				 cpumask_pr_args(tmp_cpumask),
				 cpumask_pr_args(&victim_iter->affinity));
			return false;
		}
	}

	return true;
}

/*
 * Test if the traffic for a class matches that at egress from the L3. For
 * MSC at memory controllers this is only possible if there is a single L3
 * as otherwise the counters at the memory can include bandwidth from the
 * non-local L3.
 */
static bool traffic_matches_l3(struct mpam_class *class)
{
	int err, cpu;

	lockdep_assert_cpus_held();

	if (class->type == MPAM_CLASS_CACHE && class->level == 3)
		return true;

	if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
		pr_debug("class %u is a different cache from L3\n", class->level);
		return false;
	}

	if (class->type != MPAM_CLASS_MEMORY) {
		pr_debug("class %u is neither of type cache or memory\n", class->level);
		return false;
	}

	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) {
		pr_debug("cpumask allocation failed\n");
		return false;
	}

	/* NOTE(review): cpu is not checked against nr_cpu_ids here, unlike topology_matches_l3(). */
	cpu = cpumask_any_and(&class->affinity, cpu_online_mask);
	err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
	if (err) {
		pr_debug("Failed to find L3 downstream to cpu %d\n", cpu);
		return false;
	}

	if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) {
		pr_debug("There is more than one L3\n");
		return false;
	}

	/* Be strict; the traffic might stop in the intermediate cache. */
	if (get_cpu_cacheinfo_id(cpu, 4) != -1) {
		pr_debug("L3 isn't the last level of cache\n");
		return false;
	}

	if (num_possible_nodes() > 1) {
		pr_debug("There is more than one numa node\n");
		return false;
	}

#ifdef CONFIG_HMEM_REPORTING
	if (node_devices[cpu_to_node(cpu)]->cache_dev) {
		pr_debug("There is a memory side cache\n");
		return false;
	}
#endif

	return true;
}

/* Test whether we can export MPAM_CLASS_CACHE:{2,3}?
 */
static void mpam_resctrl_pick_caches(void)
{
	struct mpam_class *class;
	struct mpam_resctrl_res *res;

	lockdep_assert_cpus_held();

	guard(srcu)(&mpam_srcu);
	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
				 srcu_read_lock_held(&mpam_srcu)) {
		if (class->type != MPAM_CLASS_CACHE) {
			pr_debug("class %u is not a cache\n", class->level);
			continue;
		}

		if (class->level != 2 && class->level != 3) {
			pr_debug("class %u is not L2 or L3\n", class->level);
			continue;
		}

		if (!cache_has_usable_cpor(class)) {
			pr_debug("class %u cache misses CPOR\n", class->level);
			continue;
		}

		/* Partial system coverage can't be expressed to resctrl. */
		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
			pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n",
				 class->level, cpumask_pr_args(&class->affinity),
				 cpumask_pr_args(cpu_possible_mask));
			continue;
		}

		if (class->level == 2)
			res = &mpam_resctrl_controls[RDT_RESOURCE_L2];
		else
			res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
		res->class = class;
	}
}

/* Pick the class that will back the MB (memory bandwidth) resource. */
static void mpam_resctrl_pick_mba(void)
{
	struct mpam_class *class, *candidate_class = NULL;
	struct mpam_resctrl_res *res;

	lockdep_assert_cpus_held();

	guard(srcu)(&mpam_srcu);
	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
				 srcu_read_lock_held(&mpam_srcu)) {
		struct mpam_props *cprops = &class->props;

		if (class->level != 3 && class->type == MPAM_CLASS_CACHE) {
			pr_debug("class %u is a cache but not the L3\n", class->level);
			continue;
		}

		if (!class_has_usable_mba(cprops)) {
			pr_debug("class %u has no bandwidth control\n", class->level);
			continue;
		}

		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
			pr_debug("class %u has missing CPUs\n", class->level);
			continue;
		}

		if (!topology_matches_l3(class)) {
			pr_debug("class %u topology doesn't match L3\n", class->level);
			continue;
		}

		if (!traffic_matches_l3(class)) {
			pr_debug("class %u traffic doesn't match L3 egress\n",
				 class->level);
			continue;
		}

		/*
		 * Pick a resource to be MBA that as close as possible to
		 * the L3. mbm_total counts the bandwidth leaving the L3
		 * cache and MBA should correspond as closely as possible
		 * for proper operation of mba_sc.
		 */
		if (!candidate_class || class->level < candidate_class->level)
			candidate_class = class;
	}

	if (candidate_class) {
		pr_debug("selected class %u to back MBA\n", candidate_class->level);
		res = &mpam_resctrl_controls[RDT_RESOURCE_MBA];
		res->class = candidate_class;
	}
}

/* Keep whichever class is nearest to the L3; the L3 itself always wins. */
static void counter_update_class(enum resctrl_event_id evt_id,
				 struct mpam_class *class)
{
	struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class;

	if (existing_class) {
		if (class->level == 3) {
			pr_debug("Existing class is L3 - L3 wins\n");
			return;
		}
		if (existing_class->level < class->level) {
			pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n",
				 existing_class->level, class->level);
			return;
		}
	}

	mpam_resctrl_counters[evt_id].class = class;
}

/* Choose the classes backing each resctrl event. */
static void mpam_resctrl_pick_counters(void)
{
	struct mpam_class *class;

	lockdep_assert_cpus_held();

	guard(srcu)(&mpam_srcu);
	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
				 srcu_read_lock_held(&mpam_srcu)) {
		/* The name of the resource is L3... */
		if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
			pr_debug("class %u is a cache but not the L3", class->level);
			continue;
		}

		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
			pr_debug("class %u does not cover all CPUs", class->level);
			continue;
		}

		if (cache_has_usable_csu(class)) {
			pr_debug("class %u has usable CSU", class->level);

			/* CSU counters only make sense on a cache.
			 */
			switch (class->type) {
			case MPAM_CLASS_CACHE:
				/* A break here skips the event, keeping limits intact. */
				if (update_rmid_limits(class))
					break;
				counter_update_class(QOS_L3_OCCUP_EVENT_ID, class);
				break;
			default:
				break;
			}
		}
	}
}

/* Fill in the rdt_resource fields for one picked control class. */
static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
{
	struct mpam_class *class = res->class;
	struct mpam_props *cprops = &class->props;
	struct rdt_resource *r = &res->resctrl_res;

	switch (r->rid) {
	case RDT_RESOURCE_L2:
	case RDT_RESOURCE_L3:
		r->schema_fmt = RESCTRL_SCHEMA_BITMAP;
		r->cache.arch_has_sparse_bitmasks = true;
		r->cache.cbm_len = class->props.cpbm_wd;

		/* mpam_devices will reject empty bitmaps */
		r->cache.min_cbm_bits = 1;

		if (r->rid == RDT_RESOURCE_L2) {
			r->name = "L2";
			r->ctrl_scope = RESCTRL_L2_CACHE;
			r->cdp_capable = true;
		} else {
			r->name = "L3";
			r->ctrl_scope = RESCTRL_L3_CACHE;
			r->cdp_capable = true;
		}

		/*
		 * Which bits are shared with other ...things... Unknown
		 * devices use partid-0 which uses all the bitmap fields. Until
		 * we have configured the SMMU and GIC not to do this 'all the
		 * bits' is the correct answer here.
		 */
		r->cache.shareable_bits = resctrl_get_default_ctrl(r);
		r->alloc_capable = true;
		break;
	case RDT_RESOURCE_MBA:
		r->schema_fmt = RESCTRL_SCHEMA_RANGE;
		r->ctrl_scope = RESCTRL_L3_CACHE;
		r->membw.delay_linear = true;
		r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
		r->membw.min_bw = get_mba_min(cprops);
		r->membw.max_bw = MAX_MBA_BW;
		r->membw.bw_gran = get_mba_granularity(cprops);
		r->name = "MB";
		r->alloc_capable = true;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * Choose the resctrl domain id for @comp as seen by @cpu: caches use their
 * own component id, everything L3-shaped borrows the L3 cache-id.
 */
static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
{
	struct mpam_class *class = comp->class;

	if (class->type == MPAM_CLASS_CACHE)
		return comp->comp_id;

	if (topology_matches_l3(class)) {
		/* Use the corresponding L3 component ID as the domain ID */
		int id = get_cpu_cacheinfo_id(cpu, 3);

		/* Implies topology_matches_l3() made a mistake */
		if (WARN_ON_ONCE(id == -1))
			return comp->comp_id;

		return id;
	}

	/* Otherwise, expose the ID used by the firmware table code. */
	return comp->comp_id;
}

/* Present the event class behind @type as part of the L3 resource. */
static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon,
				     enum resctrl_event_id type)
{
	struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
	struct rdt_resource *l3 = &res->resctrl_res;

	lockdep_assert_cpus_held();

	/*
	 * There also needs to be an L3 cache present.
	 * The check just requires any online CPU and it can't go offline as we
	 * hold the cpu lock.
	 */
	if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1)
		return 0;

	/*
	 * If there are no MPAM resources on L3, force it into existence.
	 * topology_matches_l3() already ensures this looks like the L3.
	 * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init().
	 */
	if (!res->class) {
		pr_warn_once("Faking L3 MSC to enable counters.\n");
		res->class = mpam_resctrl_counters[type].class;
	}

	/*
	 * Called multiple times!, once per event type that has a
	 * monitoring class.
	 * Setting name is necessary on monitor only platforms.
	 */
	l3->name = "L3";
	l3->mon_scope = RESCTRL_L3_CACHE;

	/*
	 * num-rmid is the upper bound for the number of monitoring groups that
	 * can exist simultaneously, including the default monitoring group for
	 * each control group. Hence, advertise the whole rmid_idx space even
	 * though each control group has its own pmg/rmid space. Unfortunately,
	 * this does mean userspace needs to know the architecture to correctly
	 * interpret this value.
	 */
	l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();

	if (resctrl_enable_mon_event(type, false, 0, NULL))
		l3->mon_capable = true;

	return 0;
}

/* Read the currently applied control value for one closid on one domain. */
u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
			    u32 closid, enum resctrl_conf_type type)
{
	u32 partid;
	struct mpam_config *cfg;
	struct mpam_props *cprops;
	struct mpam_resctrl_res *res;
	struct mpam_resctrl_dom *dom;
	enum mpam_device_features configured_by;

	lockdep_assert_cpus_held();

	if (!mpam_is_enabled())
		return resctrl_get_default_ctrl(r);

	res = container_of(r, struct mpam_resctrl_res, resctrl_res);
	dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
	cprops = &res->class->props;

	/*
	 * When CDP is enabled, but the resource doesn't support it,
	 * the control is cloned across both partids.
	 * Pick one at random to read:
	 */
	if (mpam_resctrl_hide_cdp(r->rid))
		type = CDP_DATA;

	partid = resctrl_get_config_index(closid, type);
	cfg = &dom->ctrl_comp->cfg[partid];

	switch (r->rid) {
	case RDT_RESOURCE_L2:
	case RDT_RESOURCE_L3:
		configured_by = mpam_feat_cpor_part;
		break;
	case RDT_RESOURCE_MBA:
		if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
			configured_by = mpam_feat_mbw_max;
			break;
		}
		fallthrough;
	default:
		return resctrl_get_default_ctrl(r);
	}

	/* An unconfigured partid reads back as the default control. */
	if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) ||
	    !mpam_has_feature(configured_by, cfg))
		return resctrl_get_default_ctrl(r);

	switch (configured_by) {
	case mpam_feat_cpor_part:
		return cfg->cpbm;
	case mpam_feat_mbw_max:
		return mbw_max_to_percent(cfg->mbw_max, cprops);
	default:
		return resctrl_get_default_ctrl(r);
	}
}

/* Apply one control value for one closid on one domain. */
int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
			    u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
	int err;
	u32 partid;
	struct mpam_config cfg;
	struct mpam_props *cprops;
	struct mpam_resctrl_res *res;
	struct mpam_resctrl_dom *dom;

	lockdep_assert_cpus_held();
	lockdep_assert_irqs_enabled();

	if (!mpam_is_enabled())
		return -EINVAL;

	/*
	 * No need to check the CPU as mpam_apply_config() doesn't care, and
	 * resctrl_arch_update_domains() relies on this.
	 */
	res = container_of(r, struct mpam_resctrl_res, resctrl_res);
	dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
	cprops = &res->class->props;

	if (mpam_resctrl_hide_cdp(r->rid))
		t = CDP_DATA;
	partid = resctrl_get_config_index(closid, t);

	if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) {
		pr_debug("Not alloc capable or computed PARTID out of range\n");
		return -EINVAL;
	}

	/*
	 * Copy the current config to avoid clearing other resources when the
	 * same component is exposed multiple times through resctrl.
	 */
	cfg = dom->ctrl_comp->cfg[partid];

	switch (r->rid) {
	case RDT_RESOURCE_L2:
	case RDT_RESOURCE_L3:
		cfg.cpbm = cfg_val;
		mpam_set_feature(mpam_feat_cpor_part, &cfg);
		break;
	case RDT_RESOURCE_MBA:
		if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
			cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops);
			mpam_set_feature(mpam_feat_mbw_max, &cfg);
			break;
		}
		fallthrough;
	default:
		return -EINVAL;
	}

	/*
	 * When CDP is enabled, but the resource doesn't support it, we need to
	 * apply the same configuration to the other partid.
	 */
	if (mpam_resctrl_hide_cdp(r->rid)) {
		partid = resctrl_get_config_index(closid, CDP_CODE);
		err = mpam_apply_config(dom->ctrl_comp, partid, &cfg);
		if (err)
			return err;

		partid = resctrl_get_config_index(closid, CDP_DATA);
		return mpam_apply_config(dom->ctrl_comp, partid, &cfg);
	}

	return mpam_apply_config(dom->ctrl_comp, partid, &cfg);
}

/* Push all staged configuration for @closid out to every domain. */
int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
{
	int err;
	struct rdt_ctrl_domain *d;

	lockdep_assert_cpus_held();
	lockdep_assert_irqs_enabled();

	if (!mpam_is_enabled())
		return -EINVAL;

	list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) {
		for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) {
			struct resctrl_staged_config *cfg = &d->staged_config[t];

			if (!cfg->have_new_ctrl)
				continue;

			err = resctrl_arch_update_one(r, d, closid, t,
						      cfg->new_ctrl);
			if (err)
				return err;
		}
	}

	return 0;
}

void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
{
	struct mpam_resctrl_res *res;

	lockdep_assert_cpus_held();

	if (!mpam_is_enabled())
		return;

	res = container_of(r, struct mpam_resctrl_res, resctrl_res);
	mpam_reset_class_locked(res->class);
}

/* Initialise a new domain header with the id and this first CPU. */
static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp,
					 enum resctrl_res_level rid,
					 struct rdt_domain_hdr *hdr)
{
	lockdep_assert_cpus_held();

	INIT_LIST_HEAD(&hdr->list);
	hdr->id = mpam_resctrl_pick_domain_id(cpu, comp);
	hdr->rid = rid;
	cpumask_set_cpu(cpu, &hdr->cpu_mask);
}

static void mpam_resctrl_online_domain_hdr(unsigned int cpu,
					   struct rdt_domain_hdr *hdr)
{
	lockdep_assert_cpus_held();

	cpumask_set_cpu(cpu, &hdr->cpu_mask);
}

/**
 * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU.
 * @cpu: The CPU to remove from the domain.
 * @hdr: The domain's header.
 *
 * Removes @cpu from the header mask. If this was the last CPU in the domain,
 * the domain header is removed from its parent list and true is returned,
 * indicating the parent structure can be freed.
 * If there are other CPUs in the domain, returns false.
*/ static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, struct rdt_domain_hdr *hdr) { lockdep_assert_held(&domain_list_lock); cpumask_clear_cpu(cpu, &hdr->cpu_mask); if (cpumask_empty(&hdr->cpu_mask)) { list_del_rcu(&hdr->list); synchronize_rcu(); return true; } return false; } static void mpam_resctrl_domain_insert(struct list_head *list, struct rdt_domain_hdr *new) { struct rdt_domain_hdr *err; struct list_head *pos = NULL; lockdep_assert_held(&domain_list_lock); err = resctrl_find_domain(list, new->id, &pos); if (WARN_ON_ONCE(err)) return; list_add_tail_rcu(&new->list, pos); } static struct mpam_component *find_component(struct mpam_class *class, int cpu) { struct mpam_component *comp; guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(comp, &class->components, class_list, srcu_read_lock_held(&mpam_srcu)) { if (cpumask_test_cpu(cpu, &comp->affinity)) return comp; } return NULL; } static struct mpam_resctrl_dom * mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { int err; struct mpam_resctrl_dom *dom; struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; struct mpam_class *class = res->class; struct mpam_component *comp_iter, *ctrl_comp; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_held(&domain_list_lock); ctrl_comp = NULL; guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(comp_iter, &class->components, class_list, srcu_read_lock_held(&mpam_srcu)) { if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { ctrl_comp = comp_iter; break; } } /* class has no component for this CPU */ if (WARN_ON_ONCE(!ctrl_comp)) return ERR_PTR(-EINVAL); dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); if (!dom) return ERR_PTR(-ENOMEM); if (r->alloc_capable) { dom->ctrl_comp = ctrl_comp; ctrl_d = &dom->resctrl_ctrl_dom; mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; err = resctrl_online_ctrl_domain(r, ctrl_d); if (err) goto free_domain; 
mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); } else { pr_debug("Skipped control domain online - no controls\n"); } if (r->mon_capable) { struct mpam_component *any_mon_comp; struct mpam_resctrl_mon *mon; enum resctrl_event_id eventid; /* * Even if the monitor domain is backed by a different * component, the L3 component IDs need to be used... only * there may be no ctrl_comp for the L3. * Search each event's class list for a component with * overlapping CPUs and set up the dom->mon_comp array. */ for_each_mpam_resctrl_mon(mon, eventid) { struct mpam_component *mon_comp; if (!mon->class) continue; // dummy resource mon_comp = find_component(mon->class, cpu); dom->mon_comp[eventid] = mon_comp; if (mon_comp) any_mon_comp = mon_comp; } if (!any_mon_comp) { WARN_ON_ONCE(0); err = -EFAULT; goto offline_ctrl_domain; } mon_d = &dom->resctrl_mon_dom; mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; err = resctrl_online_mon_domain(r, &mon_d->hdr); if (err) goto offline_ctrl_domain; mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); } else { pr_debug("Skipped monitor domain online - no monitors\n"); } return dom; offline_ctrl_domain: if (r->alloc_capable) { mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); resctrl_offline_ctrl_domain(r, ctrl_d); } free_domain: kfree(dom); dom = ERR_PTR(err); return dom; } /* * We know all the monitors are associated with the L3, even if there are no * controls and therefore no control component. Find the cache-id for the CPU * and use that to search for existing resctrl domains. * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id * for anything that is not a cache. 
*/ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) { int cache_id; struct mpam_resctrl_dom *dom; struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; lockdep_assert_cpus_held(); if (!l3->class) return NULL; cache_id = get_cpu_cacheinfo_id(cpu, 3); if (cache_id < 0) return NULL; list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { if (dom->resctrl_mon_dom.hdr.id == cache_id) return dom; } return NULL; } static struct mpam_resctrl_dom * mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) { struct mpam_resctrl_dom *dom; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_cpus_held(); list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) return dom; } if (r->rid != RDT_RESOURCE_L3) return NULL; /* Search the mon domain list too - needed on monitor only platforms. */ return mpam_resctrl_get_mon_domain_from_cpu(cpu); } int mpam_resctrl_online_cpu(unsigned int cpu) { struct mpam_resctrl_res *res; enum resctrl_res_level rid; guard(mutex)(&domain_list_lock); for_each_mpam_resctrl_control(res, rid) { struct mpam_resctrl_dom *dom; struct rdt_resource *r = &res->resctrl_res; if (!res->class) continue; // dummy_resource; dom = mpam_resctrl_get_domain_from_cpu(cpu, res); if (!dom) { dom = mpam_resctrl_alloc_domain(cpu, res); if (IS_ERR(dom)) return PTR_ERR(dom); } else { if (r->alloc_capable) { struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom; mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); } if (r->mon_capable) { struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom; mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); } } } resctrl_online_cpu(cpu); return 0; } void mpam_resctrl_offline_cpu(unsigned int cpu) { struct mpam_resctrl_res *res; enum resctrl_res_level rid; resctrl_offline_cpu(cpu); guard(mutex)(&domain_list_lock); for_each_mpam_resctrl_control(res, rid) { struct 
mpam_resctrl_dom *dom; struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; bool ctrl_dom_empty, mon_dom_empty; struct rdt_resource *r = &res->resctrl_res; if (!res->class) continue; // dummy resource dom = mpam_resctrl_get_domain_from_cpu(cpu, res); if (WARN_ON_ONCE(!dom)) continue; if (r->alloc_capable) { ctrl_d = &dom->resctrl_ctrl_dom; ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); if (ctrl_dom_empty) resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); } else { ctrl_dom_empty = true; } if (r->mon_capable) { mon_d = &dom->resctrl_mon_dom; mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); if (mon_dom_empty) resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); } else { mon_dom_empty = true; } if (ctrl_dom_empty && mon_dom_empty) kfree(dom); } } int mpam_resctrl_setup(void) { int err = 0; struct mpam_resctrl_res *res; enum resctrl_res_level rid; struct mpam_resctrl_mon *mon; enum resctrl_event_id eventid; wait_event(wait_cacheinfo_ready, cacheinfo_ready); cpus_read_lock(); for_each_mpam_resctrl_control(res, rid) { INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains); res->resctrl_res.rid = rid; } /* Find some classes to use for controls */ mpam_resctrl_pick_caches(); mpam_resctrl_pick_mba(); /* Initialise the resctrl structures from the classes */ for_each_mpam_resctrl_control(res, rid) { if (!res->class) continue; // dummy resource err = mpam_resctrl_control_init(res); if (err) { pr_debug("Failed to initialise rid %u\n", rid); goto internal_error; } } /* Find some classes to use for monitors */ mpam_resctrl_pick_counters(); for_each_mpam_resctrl_mon(mon, eventid) { if (!mon->class) continue; // dummy resource err = mpam_resctrl_monitor_init(mon, eventid); if (err) { pr_debug("Failed to initialise event %u\n", eventid); goto internal_error; } } cpus_read_unlock(); if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) { pr_debug("No alloc(%u) or 
monitor(%u) found - resctrl not supported\n", resctrl_arch_alloc_capable(), resctrl_arch_mon_capable()); return -EOPNOTSUPP; } err = resctrl_init(); if (err) return err; WRITE_ONCE(resctrl_enabled, true); return 0; internal_error: cpus_read_unlock(); pr_debug("Internal error %d - resctrl not supported\n", err); return err; } void mpam_resctrl_exit(void) { if (!READ_ONCE(resctrl_enabled)) return; WRITE_ONCE(resctrl_enabled, false); resctrl_exit(); } /* * The driver is detaching an MSC from this class, if resctrl was using it, * pull on resctrl_exit(). */ void mpam_resctrl_teardown_class(struct mpam_class *class) { struct mpam_resctrl_res *res; enum resctrl_res_level rid; struct mpam_resctrl_mon *mon; enum resctrl_event_id eventid; might_sleep(); for_each_mpam_resctrl_control(res, rid) { if (res->class == class) { res->class = NULL; break; } } for_each_mpam_resctrl_mon(mon, eventid) { if (mon->class == class) { mon->class = NULL; break; } } } static int __init __cacheinfo_ready(void) { cacheinfo_ready = true; wake_up(&wait_cacheinfo_ready); return 0; } device_initcall_sync(__cacheinfo_ready); #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_resctrl.c" #endif