From 93d7a3b4be30c705c39377b9e75b24a184f6c4fe Mon Sep 17 00:00:00 2001 From: Mika Kahola Date: Mon, 17 Jun 2019 11:24:13 +0300 Subject: drm/i915/icl: Add missing device ID We are missing PCI device ID for SKU ICLLP U GT 1.5F (0x8A54) as per BSPec. BSpec: 19092 Signed-off-by: Mika Kahola Reviewed-by: Clint Taylor Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20190617082413.22549-1-mika.kahola@intel.com --- include/drm/i915_pciids.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index 6d60ea68c171..6c342ac470c8 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -568,7 +568,8 @@ INTEL_VGA_DEVICE(0x8A56, info), \ INTEL_VGA_DEVICE(0x8A71, info), \ INTEL_VGA_DEVICE(0x8A70, info), \ - INTEL_VGA_DEVICE(0x8A53, info) + INTEL_VGA_DEVICE(0x8A53, info), \ + INTEL_VGA_DEVICE(0x8A54, info) #define INTEL_ICL_11_IDS(info) \ INTEL_ICL_PORT_F_IDS(info), \ -- cgit v1.2.3 From bf73fc0fa9cf78e37d6ee99e8d12bfa2083594d6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 3 Jul 2019 15:37:02 +0100 Subject: drm/i915: Show support for accurate sw PMU busyness tracking Expose whether or not we support the PMU software tracking in our scheduler capabilities, so userspace can query at runtime. v2: Use I915_SCHEDULER_CAP_ENGINE_BUSY_STATS for a less ambiguous capability name. Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190703143702.11339-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 7 ++++--- drivers/gpu/drm/i915/i915_pmu.c | 4 +--- include/uapi/drm/i915_drm.h | 1 + 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index c1fb5fa3952e..7d6d6e62e9cc 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -688,9 +688,10 @@ void intel_engines_set_scheduler_caps(struct drm_i915_private *i915) u8 engine; u8 sched; } map[] = { -#define MAP(x, y) { ilog2(I915_ENGINE_HAS_##x), ilog2(I915_SCHEDULER_CAP_##y) } - MAP(PREEMPTION, PREEMPTION), - MAP(SEMAPHORES, SEMAPHORES), +#define MAP(x, y) { ilog2(I915_ENGINE_##x), ilog2(I915_SCHEDULER_CAP_##y) } + MAP(HAS_PREEMPTION, PREEMPTION), + MAP(HAS_SEMAPHORES, SEMAPHORES), + MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS), #undef MAP }; struct intel_engine_cs *engine; diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 8fe46ee920a0..eff86483bec0 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -102,10 +102,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active) /* * Also there is software busyness tracking available we do not * need the timer for I915_SAMPLE_BUSY counter. - * - * Use RCS as proxy for all engines. */ - else if (intel_engine_supports_stats(i915->engine[RCS0])) + else if (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) enable &= ~BIT(I915_SAMPLE_BUSY); /* diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 328d05e77d9f..469dc512cca3 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -521,6 +521,7 @@ typedef struct drm_i915_irq_wait { #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1) #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) +#define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) #define I915_PARAM_HUC_STATUS 42 -- cgit v1.2.3 From 163fa23435cc9c705a71001d4aa15f3f945554a1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 3 Jul 2019 16:25:52 +0800 Subject: percpu: Make pcpu_setup_first_chunk() void function pcpu_setup_first_chunk() will panic or BUG_ON if the are some error and doesn't return any error, hence it can be defined to return void. Reported-by: kbuild test robot Signed-off-by: Kefeng Wang Signed-off-by: Dennis Zhou [Dennis: fixed kbuild warning for pcpu_page_first_chunk()] --- arch/ia64/mm/contig.c | 5 +---- arch/ia64/mm/discontig.c | 5 +---- include/linux/percpu.h | 2 +- mm/percpu.c | 19 +++++++------------ 4 files changed, 10 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index d29fb6b9fa33..db09a693f094 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -134,10 +134,7 @@ setup_per_cpu_areas(void) ai->atom_size = PAGE_SIZE; ai->alloc_size = PERCPU_PAGE_SIZE; - rc = pcpu_setup_first_chunk(ai, __per_cpu_start + __per_cpu_offset[0]); - if (rc) - panic("failed to setup percpu area (err=%d)", rc); - + pcpu_setup_first_chunk(ai, __per_cpu_start + __per_cpu_offset[0]); pcpu_free_alloc_info(ai); } #else diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 05490dd073e6..004dee231874 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -245,10 +245,7 @@ void __init setup_per_cpu_areas(void) gi->cpu_map = &cpu_map[unit]; } - rc = pcpu_setup_first_chunk(ai, base); - if (rc) - panic("failed to setup percpu area (err=%d)", rc); - + pcpu_setup_first_chunk(ai, base); pcpu_free_alloc_info(ai); } #endif diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9909dc0e273a..5e76af742c80 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -105,7 +105,7 @@ extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units); extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai); -extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, +extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK diff --git a/mm/percpu.c b/mm/percpu.c index 9821241fdede..5a918a4b1da0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2267,12 +2267,9 @@ static void pcpu_dump_alloc_info(const char *lvl, * share the same vm, but use offset regions in the area allocation map. * The chunk serving the dynamic region is circulated in the chunk slots * and available for dynamic allocation like any other chunk. - * - * RETURNS: - * 0 on success, -errno on failure. */ -int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr) +void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; @@ -2457,7 +2454,6 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done */ pcpu_base_addr = base_addr; - return 0; } #ifdef CONFIG_SMP @@ -2710,7 +2706,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, struct pcpu_alloc_info *ai; size_t size_sum, areas_size; unsigned long max_distance; - int group, i, highest_group, rc; + int group, i, highest_group, rc = 0; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); @@ -2795,7 +2791,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - rc = pcpu_setup_first_chunk(ai, base); + pcpu_setup_first_chunk(ai, base); goto out_free; out_free_areas: @@ -2839,7 +2835,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, int unit_pages; size_t pages_size; struct page **pages; - int unit, i, j, rc; + int unit, i, j, rc = 0; int upa; int nr_g0_units; @@ -2920,7 +2916,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); - rc = pcpu_setup_first_chunk(ai, vm.addr); + pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: @@ -3014,8 +3010,7 @@ void __init setup_per_cpu_areas(void) ai->groups[0].nr_units = 1; ai->groups[0].cpu_map[0] = 0; - if (pcpu_setup_first_chunk(ai, fc) < 0) - panic("Failed to initialize percpu areas."); + pcpu_setup_first_chunk(ai, fc); pcpu_free_alloc_info(ai); } -- cgit v1.2.3 From 52db6685932e326ed607644ab7ebdae8c194adda Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 10 Jul 2019 16:59:55 +0900 Subject: ASoC: simple_card_utils.h: care NULL dai at asoc_simple_debug_dai() props->xxx_dai might be NULL when DPCM. This patch cares it for debug. Fixes: commit 0580dde59438 ("ASoC: simple-card-utils: add asoc_simple_debug_info()") Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87o922gw4u.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/simple_card_utils.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h index 954563ee2277..985a5f583de4 100644 --- a/include/sound/simple_card_utils.h +++ b/include/sound/simple_card_utils.h @@ -141,6 +141,10 @@ inline void asoc_simple_debug_dai(struct asoc_simple_priv *priv, { struct device *dev = simple_priv_to_dev(priv); + /* dai might be NULL */ + if (!dai) + return; + if (dai->name) dev_dbg(dev, "%s dai name = %s\n", name, dai->name); -- cgit v1.2.3 From 22be8233b34f4f468934c5fefcbe6151766fb8f2 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 11 Jul 2019 04:53:25 -0400 Subject: media: videodev2.h: change V4L2_PIX_FMT_BGRA444 define: fourcc was already in use The V4L2_PIX_FMT_BGRA444 define clashed with the pre-existing V4L2_PIX_FMT_SGRBG12 which strangely enough used the same fourcc, even though that fourcc made no sense for a Bayer format. In any case, you can't have duplicates, so change the fourcc of V4L2_PIX_FMT_BGRA444. Signed-off-by: Hans Verkuil Cc: # for v5.2 and up Fixes: 6c84f9b1d2900 ("media: v4l: Add definitions for missing 16-bit RGB4444 formats") Reviewed-by: Laurent Pinchart Reviewed-by: Kieran Bingham Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 9d9705ceda76..2427bc4d8eba 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -518,7 +518,13 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_RGBX444 v4l2_fourcc('R', 'X', '1', '2') /* 16 rrrrgggg bbbbxxxx */ #define V4L2_PIX_FMT_ABGR444 v4l2_fourcc('A', 'B', '1', '2') /* 16 aaaabbbb ggggrrrr */ #define V4L2_PIX_FMT_XBGR444 v4l2_fourcc('X', 'B', '1', '2') /* 16 xxxxbbbb ggggrrrr */ -#define V4L2_PIX_FMT_BGRA444 v4l2_fourcc('B', 'A', '1', '2') /* 16 bbbbgggg rrrraaaa */ + +/* + * Originally this had 'BA12' as fourcc, but this clashed with the older + * V4L2_PIX_FMT_SGRBG12 which inexplicably used that same fourcc. + * So use 'GA12' instead for V4L2_PIX_FMT_BGRA444. + */ +#define V4L2_PIX_FMT_BGRA444 v4l2_fourcc('G', 'A', '1', '2') /* 16 bbbbgggg rrrraaaa */ #define V4L2_PIX_FMT_BGRX444 v4l2_fourcc('B', 'X', '1', '2') /* 16 bbbbgggg rrrrxxxx */ #define V4L2_PIX_FMT_RGB555 v4l2_fourcc('R', 'G', 'B', 'O') /* 16 RGB-5-5-5 */ #define V4L2_PIX_FMT_ARGB555 v4l2_fourcc('A', 'R', '1', '5') /* 16 ARGB-1-5-5-5 */ -- cgit v1.2.3 From 9747f0c2fb9e1a6d7c907d34a373924c84093afa Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 11 Jul 2019 10:30:59 -0700 Subject: drm/i915/tgl: Add TGL PCI IDs Current list of PCI IDs for Tiger Lake. Cc: Rodrigo Vivi Signed-off-by: Lucas De Marchi Reviewed-by: Rodrigo Vivi Reviewed-by: Mika Kahola Link: https://patchwork.freedesktop.org/patch/msgid/20190711173115.28296-6-lucas.demarchi@intel.com --- drivers/gpu/drm/i915/i915_pci.c | 1 + include/drm/i915_pciids.h | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index da926485845d..e83c94cf2744 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -865,6 +865,7 @@ static const struct pci_device_id pciidlist[] = { INTEL_CNL_IDS(&intel_cannonlake_info), INTEL_ICL_11_IDS(&intel_icelake_11_info), INTEL_EHL_IDS(&intel_elkhartlake_info), + INTEL_TGL_12_IDS(&intel_tigerlake_12_info), {0, 0, 0} }; MODULE_DEVICE_TABLE(pci, pciidlist); diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index 6c342ac470c8..a70c982ddff9 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -583,4 +583,14 @@ INTEL_VGA_DEVICE(0x4551, info), \ INTEL_VGA_DEVICE(0x4541, info) +/* TGL */ +#define INTEL_TGL_12_IDS(info) \ + INTEL_VGA_DEVICE(0x9A49, info), \ + INTEL_VGA_DEVICE(0x9A40, info), \ + INTEL_VGA_DEVICE(0x9A59, info), \ + INTEL_VGA_DEVICE(0x9A60, info), \ + INTEL_VGA_DEVICE(0x9A68, info), \ + INTEL_VGA_DEVICE(0x9A70, info), \ + INTEL_VGA_DEVICE(0x9A78, info) + #endif /* _I915_PCIIDS_H */ -- cgit v1.2.3 From 6c8337dafaa9a328216c62a79c9a03176af3ce70 Mon Sep 17 00:00:00 2001 From: Vandita Kulkarni Date: Thu, 11 Jul 2019 10:31:06 -0700 Subject: drm/i915/tgl: Add additional ports for Tiger Lake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are 2 new additional typeC ports in Tiger Lake and PORT-C is now a combophy port. This results in 6 typeC ports and 3 combophy ports. These 6 TC ports can be DP alternate mode, DP over thunderbolt, native DP on legacy DP connector or native HDMI on legacy connector. v2: Rebase on new modular FIA code (Lucas) v3: Also add new port in port_identifier(), even though it can't possibly be used there (requested by José) v4: Add conversion port->tc_port in helper function after introction of phy namespace (Lucas) Cc: Anusha Srivatsa Signed-off-by: Vandita Kulkarni Signed-off-by: Lucas De Marchi Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20190711173115.28296-13-lucas.demarchi@intel.com --- drivers/gpu/drm/i915/display/intel_ddi.c | 12 ++++++++++++ drivers/gpu/drm/i915/display/intel_display.c | 3 +++ drivers/gpu/drm/i915/display/intel_display.h | 8 ++++++++ include/drm/i915_component.h | 2 +- include/drm/i915_drm.h | 3 +++ 5 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 1662e5c2be1c..8445244aa593 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -4286,6 +4286,18 @@ void intel_ddi_init(struct drm_i915_private *dev_priv, enum port port) intel_dig_port->ddi_io_power_domain = POWER_DOMAIN_PORT_DDI_F_IO; break; + case PORT_G: + intel_dig_port->ddi_io_power_domain = + POWER_DOMAIN_PORT_DDI_G_IO; + break; + case PORT_H: + intel_dig_port->ddi_io_power_domain = + POWER_DOMAIN_PORT_DDI_H_IO; + break; + case PORT_I: + intel_dig_port->ddi_io_power_domain = + POWER_DOMAIN_PORT_DDI_I_IO; + break; default: MISSING_CASE(port); } diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index c13784b81fb1..13ff4177beba 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -6706,6 +6706,9 @@ enum tc_port intel_port_to_tc(struct drm_i915_private *dev_priv, enum port port) if (!intel_phy_is_tc(dev_priv, intel_port_to_phy(dev_priv, port))) return PORT_TC_NONE; + if (INTEL_GEN(dev_priv) >= 12) + return port - PORT_D; + return port - PORT_C; } diff --git a/drivers/gpu/drm/i915/display/intel_display.h b/drivers/gpu/drm/i915/display/intel_display.h index 1f75b0a627fd..72ce27079a56 100644 --- a/drivers/gpu/drm/i915/display/intel_display.h +++ b/drivers/gpu/drm/i915/display/intel_display.h @@ -177,6 +177,12 @@ static inline const char *port_identifier(enum port port) return "Port E"; case PORT_F: return "Port F"; + case PORT_G: + return "Port G"; + case PORT_H: + return "Port H"; + case PORT_I: + return "Port I"; default: return ""; } @@ -189,6 +195,8 @@ enum tc_port { PORT_TC2, PORT_TC3, PORT_TC4, + PORT_TC5, + PORT_TC6, I915_MAX_TC_PORTS }; diff --git a/include/drm/i915_component.h b/include/drm/i915_component.h index dcb95bd9dee6..55c3b123581b 100644 --- a/include/drm/i915_component.h +++ b/include/drm/i915_component.h @@ -34,7 +34,7 @@ enum i915_component_type { /* MAX_PORT is the number of port * It must be sync with I915_MAX_PORTS defined i915_drv.h */ -#define MAX_PORTS 6 +#define MAX_PORTS 9 /** * struct i915_audio_component - Used for direct communication between i915 and hda drivers diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h index 7523e9a7b6e2..eb30062359d1 100644 --- a/include/drm/i915_drm.h +++ b/include/drm/i915_drm.h @@ -109,6 +109,9 @@ enum port { PORT_D, PORT_E, PORT_F, + PORT_G, + PORT_H, + PORT_I, I915_MAX_PORTS }; -- cgit v1.2.3 From b5893ffc274be966f95aa35f35916fa8725af154 Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Fri, 12 Jul 2019 13:24:25 +0200 Subject: drm/i915: Drop extern qualifiers from header function prototypes Follow dim checkpatch recommendation so it doesn't complain on that now and again on header file modifications. v2: drop testing leftover (Chris) Signed-off-by: Janusz Krzysztofik Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20190712112429.740-2-janusz.krzysztofik@linux.intel.com --- drivers/gpu/drm/i915/gem/i915_gem_object.h | 2 +- drivers/gpu/drm/i915/gvt/gtt.h | 13 +++++---- drivers/gpu/drm/i915/i915_drv.h | 47 ++++++++++++++---------------- drivers/gpu/drm/i915/i915_irq.h | 4 +-- drivers/gpu/drm/i915/oa/i915_oa_bdw.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_bxt.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_cflgt2.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_cflgt3.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_chv.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_cnl.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_glk.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_hsw.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_icl.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_kblgt2.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_kblgt3.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_sklgt2.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_sklgt3.h | 2 +- drivers/gpu/drm/i915/oa/i915_oa_sklgt4.h | 2 +- include/drm/i915_drm.h | 10 +++---- 19 files changed, 51 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 20754c15412a..67aea07ea019 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -81,7 +81,7 @@ i915_gem_object_lookup(struct drm_file *file, u32 handle) } __deprecated -extern struct drm_gem_object * +struct drm_gem_object * drm_gem_object_lookup(struct drm_file *file, u32 handle); __attribute__((nonnull)) diff --git a/drivers/gpu/drm/i915/gvt/gtt.h b/drivers/gpu/drm/i915/gvt/gtt.h index 42d0394f0de2..88789316807d 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.h +++ b/drivers/gpu/drm/i915/gvt/gtt.h @@ -205,17 +205,18 @@ struct intel_vgpu_gtt { struct intel_vgpu_scratch_pt scratch_pt[GTT_TYPE_MAX]; }; -extern int intel_vgpu_init_gtt(struct intel_vgpu *vgpu); -extern void intel_vgpu_clean_gtt(struct intel_vgpu *vgpu); +int intel_vgpu_init_gtt(struct intel_vgpu *vgpu); +void intel_vgpu_clean_gtt(struct intel_vgpu *vgpu); void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu, bool invalidate_old); void intel_vgpu_invalidate_ppgtt(struct intel_vgpu *vgpu); -extern int intel_gvt_init_gtt(struct intel_gvt *gvt); +int intel_gvt_init_gtt(struct intel_gvt *gvt); void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu); -extern void intel_gvt_clean_gtt(struct intel_gvt *gvt); +void intel_gvt_clean_gtt(struct intel_gvt *gvt); -extern struct intel_vgpu_mm *intel_gvt_find_ppgtt_mm(struct intel_vgpu *vgpu, - int page_table_level, void *root_entry); +struct intel_vgpu_mm *intel_gvt_find_ppgtt_mm(struct intel_vgpu *vgpu, + int page_table_level, + void *root_entry); struct intel_vgpu_oos_page { struct intel_vgpu_ppgtt_spt *spt; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 56527a7a1666..1a6b4e14a405 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2390,19 +2390,17 @@ __i915_printk(struct drm_i915_private *dev_priv, const char *level, __i915_printk(dev_priv, KERN_ERR, fmt, ##__VA_ARGS__) #ifdef CONFIG_COMPAT -extern long i915_compat_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg); +long i915_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); #else #define i915_compat_ioctl NULL #endif extern const struct dev_pm_ops i915_pm_ops; -extern int i915_driver_load(struct pci_dev *pdev, - const struct pci_device_id *ent); -extern void i915_driver_unload(struct drm_device *dev); +int i915_driver_load(struct pci_dev *pdev, const struct pci_device_id *ent); +void i915_driver_unload(struct drm_device *dev); -extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine); -extern void intel_hangcheck_init(struct drm_i915_private *dev_priv); +void intel_engine_init_hangcheck(struct intel_engine_cs *engine); +void intel_hangcheck_init(struct drm_i915_private *dev_priv); int vlv_force_gfx_clock(struct drm_i915_private *dev_priv, bool on); u32 intel_calculate_mcr_s_ss_select(struct drm_i915_private *dev_priv); @@ -2672,14 +2670,14 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, bool is_master); /* i915_perf.c */ -extern void i915_perf_init(struct drm_i915_private *dev_priv); -extern void i915_perf_fini(struct drm_i915_private *dev_priv); -extern void i915_perf_register(struct drm_i915_private *dev_priv); -extern void i915_perf_unregister(struct drm_i915_private *dev_priv); +void i915_perf_init(struct drm_i915_private *dev_priv); +void i915_perf_fini(struct drm_i915_private *dev_priv); +void i915_perf_register(struct drm_i915_private *dev_priv); +void i915_perf_unregister(struct drm_i915_private *dev_priv); /* i915_suspend.c */ -extern int i915_save_state(struct drm_i915_private *dev_priv); -extern int i915_restore_state(struct drm_i915_private *dev_priv); +int i915_save_state(struct drm_i915_private *dev_priv); +int i915_restore_state(struct drm_i915_private *dev_priv); /* i915_sysfs.c */ void i915_setup_sysfs(struct drm_i915_private *dev_priv); @@ -2693,23 +2691,22 @@ mkwrite_device_info(struct drm_i915_private *dev_priv) } /* modesetting */ -extern void intel_modeset_init_hw(struct drm_device *dev); -extern int intel_modeset_init(struct drm_device *dev); -extern void intel_modeset_cleanup(struct drm_device *dev); -extern int intel_modeset_vga_set_state(struct drm_i915_private *dev_priv, - bool state); -extern void intel_display_resume(struct drm_device *dev); -extern void i915_redisable_vga(struct drm_i915_private *dev_priv); -extern void i915_redisable_vga_power_on(struct drm_i915_private *dev_priv); -extern void intel_init_pch_refclk(struct drm_i915_private *dev_priv); +void intel_modeset_init_hw(struct drm_device *dev); +int intel_modeset_init(struct drm_device *dev); +void intel_modeset_cleanup(struct drm_device *dev); +int intel_modeset_vga_set_state(struct drm_i915_private *dev_priv, bool state); +void intel_display_resume(struct drm_device *dev); +void i915_redisable_vga(struct drm_i915_private *dev_priv); +void i915_redisable_vga_power_on(struct drm_i915_private *dev_priv); +void intel_init_pch_refclk(struct drm_i915_private *dev_priv); int i915_reg_read_ioctl(struct drm_device *dev, void *data, struct drm_file *file); -extern struct intel_display_error_state * +struct intel_display_error_state * intel_display_capture_error_state(struct drm_i915_private *dev_priv); -extern void intel_display_print_error_state(struct drm_i915_error_state_buf *e, - struct intel_display_error_state *error); +void intel_display_print_error_state(struct drm_i915_error_state_buf *e, + struct intel_display_error_state *error); #define __I915_REG_OP(op__, dev_priv__, ...) \ intel_uncore_##op__(&(dev_priv__)->uncore, __VA_ARGS__) diff --git a/drivers/gpu/drm/i915/i915_irq.h b/drivers/gpu/drm/i915/i915_irq.h index d93fa4e75442..4f803f910177 100644 --- a/drivers/gpu/drm/i915/i915_irq.h +++ b/drivers/gpu/drm/i915/i915_irq.h @@ -13,8 +13,8 @@ struct drm_i915_private; struct intel_crtc; -extern void intel_irq_init(struct drm_i915_private *dev_priv); -extern void intel_irq_fini(struct drm_i915_private *dev_priv); +void intel_irq_init(struct drm_i915_private *dev_priv); +void intel_irq_fini(struct drm_i915_private *dev_priv); int intel_irq_install(struct drm_i915_private *dev_priv); void intel_irq_uninstall(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/oa/i915_oa_bdw.h b/drivers/gpu/drm/i915/oa/i915_oa_bdw.h index 0e667f1a8aa1..b5ed68882588 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_bdw.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_bdw.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_BDW_H__ #define __I915_OA_BDW_H__ -extern void i915_perf_load_test_config_bdw(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_bdw(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_bxt.h b/drivers/gpu/drm/i915/oa/i915_oa_bxt.h index 679e92cf4f1d..43c3e4ab030a 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_bxt.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_bxt.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_BXT_H__ #define __I915_OA_BXT_H__ -extern void i915_perf_load_test_config_bxt(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_bxt(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_cflgt2.h b/drivers/gpu/drm/i915/oa/i915_oa_cflgt2.h index 4d6025559bbe..1b4b563bc585 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_cflgt2.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_cflgt2.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_CFLGT2_H__ #define __I915_OA_CFLGT2_H__ -extern void i915_perf_load_test_config_cflgt2(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_cflgt2(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_cflgt3.h b/drivers/gpu/drm/i915/oa/i915_oa_cflgt3.h index 0697f4077402..500565e055cd 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_cflgt3.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_cflgt3.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_CFLGT3_H__ #define __I915_OA_CFLGT3_H__ -extern void i915_perf_load_test_config_cflgt3(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_cflgt3(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_chv.h b/drivers/gpu/drm/i915/oa/i915_oa_chv.h index 0986eae3135f..ad85d6a6a573 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_chv.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_chv.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_CHV_H__ #define __I915_OA_CHV_H__ -extern void i915_perf_load_test_config_chv(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_chv(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_cnl.h b/drivers/gpu/drm/i915/oa/i915_oa_cnl.h index e830a406aff2..9faaca38b587 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_cnl.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_cnl.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_CNL_H__ #define __I915_OA_CNL_H__ -extern void i915_perf_load_test_config_cnl(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_cnl(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_glk.h b/drivers/gpu/drm/i915/oa/i915_oa_glk.h index 06dedf991edb..cc13a1e9fd3e 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_glk.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_glk.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_GLK_H__ #define __I915_OA_GLK_H__ -extern void i915_perf_load_test_config_glk(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_glk(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_hsw.h b/drivers/gpu/drm/i915/oa/i915_oa_hsw.h index 3d0c870cd0bd..f0ddcc79c761 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_hsw.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_hsw.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_HSW_H__ #define __I915_OA_HSW_H__ -extern void i915_perf_load_test_config_hsw(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_hsw(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_icl.h b/drivers/gpu/drm/i915/oa/i915_oa_icl.h index 24eaa97d61ba..e501651d385b 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_icl.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_icl.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_ICL_H__ #define __I915_OA_ICL_H__ -extern void i915_perf_load_test_config_icl(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_icl(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_kblgt2.h b/drivers/gpu/drm/i915/oa/i915_oa_kblgt2.h index a55398a904de..dc460e6e0fae 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_kblgt2.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_kblgt2.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_KBLGT2_H__ #define __I915_OA_KBLGT2_H__ -extern void i915_perf_load_test_config_kblgt2(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_kblgt2(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_kblgt3.h b/drivers/gpu/drm/i915/oa/i915_oa_kblgt3.h index 3ddd3483b7cc..5926992b735a 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_kblgt3.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_kblgt3.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_KBLGT3_H__ #define __I915_OA_KBLGT3_H__ -extern void i915_perf_load_test_config_kblgt3(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_kblgt3(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_sklgt2.h b/drivers/gpu/drm/i915/oa/i915_oa_sklgt2.h index be6256037239..353db35b36c1 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_sklgt2.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_sklgt2.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_SKLGT2_H__ #define __I915_OA_SKLGT2_H__ -extern void i915_perf_load_test_config_sklgt2(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_sklgt2(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_sklgt3.h b/drivers/gpu/drm/i915/oa/i915_oa_sklgt3.h index 650beb068e56..52f94c674b62 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_sklgt3.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_sklgt3.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_SKLGT3_H__ #define __I915_OA_SKLGT3_H__ -extern void i915_perf_load_test_config_sklgt3(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_sklgt3(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/oa/i915_oa_sklgt4.h b/drivers/gpu/drm/i915/oa/i915_oa_sklgt4.h index 8dcf849d131e..8e364820cc63 100644 --- a/drivers/gpu/drm/i915/oa/i915_oa_sklgt4.h +++ b/drivers/gpu/drm/i915/oa/i915_oa_sklgt4.h @@ -10,6 +10,6 @@ #ifndef __I915_OA_SKLGT4_H__ #define __I915_OA_SKLGT4_H__ -extern void i915_perf_load_test_config_sklgt4(struct drm_i915_private *dev_priv); +void i915_perf_load_test_config_sklgt4(struct drm_i915_private *dev_priv); #endif diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h index eb30062359d1..23274cf92712 100644 --- a/include/drm/i915_drm.h +++ b/include/drm/i915_drm.h @@ -30,11 +30,11 @@ #include /* For use by IPS driver */ -extern unsigned long i915_read_mch_val(void); -extern bool i915_gpu_raise(void); -extern bool i915_gpu_lower(void); -extern bool i915_gpu_busy(void); -extern bool i915_gpu_turbo_disable(void); +unsigned long i915_read_mch_val(void); +bool i915_gpu_raise(void); +bool i915_gpu_lower(void); +bool i915_gpu_busy(void); +bool i915_gpu_turbo_disable(void); /* Exported from arch/x86/kernel/early-quirks.c */ extern struct resource intel_graphics_stolen_res; -- cgit v1.2.3 From 3c00fb0bf0e0f061715c04ad609de93ddc046aa1 Mon Sep 17 00:00:00 2001 From: xiao ruizhu Date: Thu, 4 Jul 2019 11:31:13 +0800 Subject: netfilter: nf_conntrack_sip: fix expectation clash When conntracks change during a dialog, SDP messages may be sent from different conntracks to establish expects with identical tuples. In this case expects conflict may be detected for the 2nd SDP message and end up with a process failure. The fixing here is to reuse an existing expect who has the same tuple for a different conntrack if any. Here are two scenarios for the case. 1) SERVER CPE | INVITE SDP | 5060 |<----------------------|5060 | 100 Trying | 5060 |---------------------->|5060 | 183 SDP | 5060 |---------------------->|5060 ===> Conntrack 1 | PRACK | 50601 |<----------------------|5060 | 200 OK (PRACK) | 50601 |---------------------->|5060 | 200 OK (INVITE) | 5060 |---------------------->|5060 | ACK | 50601 |<----------------------|5060 | | |<--- RTP stream ------>| | | | INVITE SDP (t38) | 50601 |---------------------->|5060 ===> Conntrack 2 With a certain configuration in the CPE, SIP messages "183 with SDP" and "re-INVITE with SDP t38" will go through the sip helper to create expects for RTP and RTCP. It is okay to create RTP and RTCP expects for "183", whose master connection source port is 5060, and destination port is 5060. In the "183" message, port in Contact header changes to 50601 (from the original 5060). So the following requests e.g. PRACK and ACK are sent to port 50601. It is a different conntrack (let call Conntrack 2) from the original INVITE (let call Conntrack 1) due to the port difference. In this example, after the call is established, there is RTP stream but no RTCP stream for Conntrack 1, so the RTP expect created upon "183" is cleared, and RTCP expect created for Conntrack 1 retains. When "re-INVITE with SDP t38" arrives to create RTP&RTCP expects, current ALG implementation will call nf_ct_expect_related() for RTP and RTCP. The expects tuples are identical to those for Conntrack 1. RTP expect for Conntrack 2 succeeds in creation as the one for Conntrack 1 has been removed. RTCP expect for Conntrack 2 fails in creation because it has idential tuples and 'conflict' with the one retained for Conntrack 1. And then result in a failure in processing of the re-INVITE. 2) SERVER A CPE | REGISTER | 5060 |<------------------| 5060 ==> CT1 | 200 | 5060 |------------------>| 5060 | | | INVITE SDP(1) | 5060 |<------------------| 5060 | 300(multi choice) | 5060 |------------------>| 5060 SERVER B | ACK | 5060 |<------------------| 5060 | INVITE SDP(2) | 5060 |-------------------->| 5060 ==> CT2 | 100 | 5060 |<--------------------| 5060 | 200(contact changes)| 5060 |<--------------------| 5060 | ACK | 5060 |-------------------->| 50601 ==> CT3 | | |<--- RTP stream ---->| | | | BYE | 5060 |<--------------------| 50601 | 200 | 5060 |-------------------->| 50601 | INVITE SDP(3) | 5060 |<------------------| 5060 ==> CT1 CPE sends an INVITE request(1) to Server A, and creates a RTP&RTCP expect pair for this Conntrack 1 (CT1). Server A responds 300 to redirect to Server B. The RTP&RTCP expect pairs created on CT1 are removed upon 300 response. CPE sends the INVITE request(2) to Server B, and creates an expect pair for the new conntrack (due to destination address difference), let call CT2. Server B changes the port to 50601 in 200 OK response, and the following requests ACK and BYE from CPE are sent to 50601. The call is established. There is RTP stream and no RTCP stream. So RTP expect is removed and RTCP expect for CT2 retains. As BYE request is sent from port 50601, it is another conntrack, let call CT3, different from CT2 due to the port difference. So the BYE request will not remove the RTCP expect for CT2. Then another outgoing call is made, with the same RTP port being used (not definitely but possibly). CPE firstly sends the INVITE request(3) to Server A, and tries to create a RTP&RTCP expect pairs for this CT1. In current ALG implementation, the RTCP expect for CT1 fails in creation because it 'conflicts' with the residual one for CT2. As a result the INVITE request fails to send. Signed-off-by: xiao ruizhu Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 12 +++++++++--- net/ipv4/netfilter/nf_nat_h323.c | 12 ++++++------ net/netfilter/ipvs/ip_vs_nfct.c | 2 +- net/netfilter/nf_conntrack_amanda.c | 2 +- net/netfilter/nf_conntrack_broadcast.c | 2 +- net/netfilter/nf_conntrack_expect.c | 26 +++++++++++++++++++------- net/netfilter/nf_conntrack_ftp.c | 2 +- net/netfilter/nf_conntrack_h323_main.c | 18 +++++++++--------- net/netfilter/nf_conntrack_irc.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 4 ++-- net/netfilter/nf_conntrack_pptp.c | 4 ++-- net/netfilter/nf_conntrack_sane.c | 2 +- net/netfilter/nf_conntrack_sip.c | 10 +++++++--- net/netfilter/nf_conntrack_tftp.c | 2 +- net/netfilter/nf_nat_amanda.c | 2 +- net/netfilter/nf_nat_ftp.c | 2 +- net/netfilter/nf_nat_irc.c | 2 +- net/netfilter/nf_nat_sip.c | 8 +++++--- net/netfilter/nf_nat_tftp.c | 2 +- net/netfilter/nft_ct.c | 2 +- 20 files changed, 71 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 93ce6b0daaba..573429be4d59 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -76,6 +76,11 @@ struct nf_conntrack_expect_policy { #define NF_CT_EXPECT_CLASS_DEFAULT 0 #define NF_CT_EXPECT_MAX_CNT 255 +/* Allow to reuse expectations with the same tuples from different master + * conntracks. + */ +#define NF_CT_EXP_F_SKIP_MASTER 0x1 + int nf_conntrack_expect_pernet_init(struct net *net); void nf_conntrack_expect_pernet_fini(struct net *net); @@ -122,10 +127,11 @@ void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t, u_int8_t, const __be16 *, const __be16 *); void nf_ct_expect_put(struct nf_conntrack_expect *exp); int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, - u32 portid, int report); -static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect) + u32 portid, int report, unsigned int flags); +static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect, + unsigned int flags) { - return nf_ct_expect_related_report(expect, 0, 0); + return nf_ct_expect_related_report(expect, 0, 0, flags); } #endif /*_NF_CONNTRACK_EXPECT_H*/ diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 87b711fd5a44..3e2685c120c7 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -221,11 +221,11 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, int ret; rtp_exp->tuple.dst.u.udp.port = htons(nated_port); - ret = nf_ct_expect_related(rtp_exp); + ret = nf_ct_expect_related(rtp_exp, 0); if (ret == 0) { rtcp_exp->tuple.dst.u.udp.port = htons(nated_port + 1); - ret = nf_ct_expect_related(rtcp_exp); + ret = nf_ct_expect_related(rtcp_exp, 0); if (ret == 0) break; else if (ret == -EBUSY) { @@ -296,7 +296,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, int ret; exp->tuple.dst.u.tcp.port = htons(nated_port); - ret = nf_ct_expect_related(exp); + ret = nf_ct_expect_related(exp, 0); if (ret == 0) break; else if (ret != -EBUSY) { @@ -352,7 +352,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, int ret; exp->tuple.dst.u.tcp.port = htons(nated_port); - ret = nf_ct_expect_related(exp); + ret = nf_ct_expect_related(exp, 0); if (ret == 0) break; else if (ret != -EBUSY) { @@ -444,7 +444,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, int ret; exp->tuple.dst.u.tcp.port = htons(nated_port); - ret = nf_ct_expect_related(exp); + ret = nf_ct_expect_related(exp, 0); if (ret == 0) break; else if (ret != -EBUSY) { @@ -537,7 +537,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, int ret; exp->tuple.dst.u.tcp.port = htons(nated_port); - ret = nf_ct_expect_related(exp); + ret = nf_ct_expect_related(exp, 0); if (ret == 0) break; else if (ret != -EBUSY) { diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 403541996952..08adcb222986 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -231,7 +231,7 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", __func__, ct, ARG_TUPLE(&exp->tuple)); - nf_ct_expect_related(exp); + nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); } EXPORT_SYMBOL(ip_vs_nfct_expect_related); diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index 42ee659d0d1e..d011d2eb0848 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -159,7 +159,7 @@ static int amanda_help(struct sk_buff *skb, if (nf_nat_amanda && ct->status & IPS_NAT_MASK) ret = nf_nat_amanda(skb, ctinfo, protoff, off - dataoff, len, exp); - else if (nf_ct_expect_related(exp) != 0) { + else if (nf_ct_expect_related(exp, 0) != 0) { nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; } diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 921a7b95be68..1ba6becc3079 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -68,7 +68,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, exp->class = NF_CT_EXPECT_CLASS_DEFAULT; exp->helper = NULL; - nf_ct_expect_related(exp); + nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); nf_ct_refresh(ct, skb, timeout * HZ); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index ffd1f4906c4f..65364de915d1 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -249,13 +249,22 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { - return a->master == b->master && - nf_ct_tuple_equal(&a->tuple, &b->tuple) && + return nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } +static bool master_matches(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b, + unsigned int flags) +{ + if (flags & NF_CT_EXP_F_SKIP_MASTER) + return true; + + return a->master == b->master; +} + /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { @@ -399,7 +408,8 @@ static void evict_oldest_expect(struct nf_conn *master, nf_ct_remove_expect(last); } -static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) +static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, + unsigned int flags) { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; @@ -417,8 +427,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { - if (expect_matches(i, expect)) { - if (i->class != expect->class) + if (master_matches(i, expect, flags) && + expect_matches(i, expect)) { + if (i->class != expect->class || + i->master != expect->master) return -EALREADY; if (nf_ct_remove_expect(i)) @@ -453,12 +465,12 @@ out: } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, - u32 portid, int report) + u32 portid, int report, unsigned int flags) { int ret; spin_lock_bh(&nf_conntrack_expect_lock); - ret = __nf_ct_expect_check(expect); + ret = __nf_ct_expect_check(expect, flags); if (ret < 0) goto out; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 8c6c11bab5b6..0ecb3e289ef2 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -525,7 +525,7 @@ skip_nl_seq: protoff, matchoff, matchlen, exp); else { /* Can't expect this? Best to drop packet now. */ - if (nf_ct_expect_related(exp) != 0) { + if (nf_ct_expect_related(exp, 0) != 0) { nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; } else diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 6497e5fc0871..8ba037b76ad3 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -305,8 +305,8 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, rtp_port, rtp_exp, rtcp_exp); } else { /* Conntrack only */ - if (nf_ct_expect_related(rtp_exp) == 0) { - if (nf_ct_expect_related(rtcp_exp) == 0) { + if (nf_ct_expect_related(rtp_exp, 0) == 0) { + if (nf_ct_expect_related(rtcp_exp, 0) == 0) { pr_debug("nf_ct_h323: expect RTP "); nf_ct_dump_tuple(&rtp_exp->tuple); pr_debug("nf_ct_h323: expect RTCP "); @@ -364,7 +364,7 @@ static int expect_t120(struct sk_buff *skb, ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ - if (nf_ct_expect_related(exp) == 0) { + if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_h323: expect T.120 "); nf_ct_dump_tuple(&exp->tuple); } else @@ -701,7 +701,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ - if (nf_ct_expect_related(exp) == 0) { + if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_q931: expect H.245 "); nf_ct_dump_tuple(&exp->tuple); } else @@ -825,7 +825,7 @@ static int expect_callforwarding(struct sk_buff *skb, protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ - if (nf_ct_expect_related(exp) == 0) { + if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_q931: expect Call Forwarding "); nf_ct_dump_tuple(&exp->tuple); } else @@ -1284,7 +1284,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, ret = nat_q931(skb, ct, ctinfo, protoff, data, taddr, i, port, exp); } else { /* Conntrack only */ - if (nf_ct_expect_related(exp) == 0) { + if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); nf_ct_dump_tuple(&exp->tuple); @@ -1349,7 +1349,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, IPPROTO_UDP, NULL, &port); exp->helper = nf_conntrack_helper_ras; - if (nf_ct_expect_related(exp) == 0) { + if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect RAS "); nf_ct_dump_tuple(&exp->tuple); } else @@ -1561,7 +1561,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, exp->flags = NF_CT_EXPECT_PERMANENT; exp->helper = nf_conntrack_helper_q931; - if (nf_ct_expect_related(exp) == 0) { + if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); nf_ct_dump_tuple(&exp->tuple); } else @@ -1615,7 +1615,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, exp->flags = NF_CT_EXPECT_PERMANENT; exp->helper = nf_conntrack_helper_q931; - if (nf_ct_expect_related(exp) == 0) { + if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); nf_ct_dump_tuple(&exp->tuple); } else diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 7ac156f1f3bc..e40988a2f22f 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -213,7 +213,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, addr_beg_p - ib_ptr, addr_end_p - addr_beg_p, exp); - else if (nf_ct_expect_related(exp) != 0) { + else if (nf_ct_expect_related(exp, 0) != 0) { nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1b77444d5b52..6aa01eb6fe99 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2616,7 +2616,7 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (IS_ERR(exp)) return PTR_ERR(exp); - err = nf_ct_expect_related_report(exp, portid, report); + err = nf_ct_expect_related_report(exp, portid, report, 0); nf_ct_expect_put(exp); return err; } @@ -3367,7 +3367,7 @@ ctnetlink_create_expect(struct net *net, goto err_rcu; } - err = nf_ct_expect_related_report(exp, portid, report); + err = nf_ct_expect_related_report(exp, portid, report, 0); nf_ct_expect_put(exp); err_rcu: rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index b22042ad0fca..a971183f11af 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -234,9 +234,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre); if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK) nf_nat_pptp_exp_gre(exp_orig, exp_reply); - if (nf_ct_expect_related(exp_orig) != 0) + if (nf_ct_expect_related(exp_orig, 0) != 0) goto out_put_both; - if (nf_ct_expect_related(exp_reply) != 0) + if (nf_ct_expect_related(exp_reply, 0) != 0) goto out_unexpect_orig; /* Add GRE keymap entries */ diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 81448c3db661..1aebd6569d4e 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -153,7 +153,7 @@ static int help(struct sk_buff *skb, nf_ct_dump_tuple(&exp->tuple); /* Can't expect this? Best to drop packet now. */ - if (nf_ct_expect_related(exp) != 0) { + if (nf_ct_expect_related(exp, 0) != 0) { nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; } diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 107251731809..b83dc9bf0a5d 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -977,11 +977,15 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, /* -EALREADY handling works around end-points that send * SDP messages with identical port but different media type, * we pretend expectation was set up. + * It also works in the case that SDP messages are sent with + * identical expect tuples but for different master conntracks. */ - int errp = nf_ct_expect_related(rtp_exp); + int errp = nf_ct_expect_related(rtp_exp, + NF_CT_EXP_F_SKIP_MASTER); if (errp == 0 || errp == -EALREADY) { - int errcp = nf_ct_expect_related(rtcp_exp); + int errcp = nf_ct_expect_related(rtcp_exp, + NF_CT_EXP_F_SKIP_MASTER); if (errcp == 0 || errcp == -EALREADY) ret = NF_ACCEPT; @@ -1296,7 +1300,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, ret = hooks->expect(skb, protoff, dataoff, dptr, datalen, exp, matchoff, matchlen); else { - if (nf_ct_expect_related(exp) != 0) { + if (nf_ct_expect_related(exp, 0) != 0) { nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; } else diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index df6d6d61bd58..80ee53f29f68 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -78,7 +78,7 @@ static int tftp_help(struct sk_buff *skb, nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook); if (nf_nat_tftp && ct->status & IPS_NAT_MASK) ret = nf_nat_tftp(skb, ctinfo, exp); - else if (nf_ct_expect_related(exp) != 0) { + else if (nf_ct_expect_related(exp, 0) != 0) { nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; } diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c index a352604d6186..3bc7e0854efe 100644 --- a/net/netfilter/nf_nat_amanda.c +++ b/net/netfilter/nf_nat_amanda.c @@ -48,7 +48,7 @@ static unsigned int help(struct sk_buff *skb, int res; exp->tuple.dst.u.tcp.port = htons(port); - res = nf_ct_expect_related(exp); + res = nf_ct_expect_related(exp, 0); if (res == 0) break; else if (res != -EBUSY) { diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index d48484a9d52d..aace6768a64e 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -91,7 +91,7 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, int ret; exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); + ret = nf_ct_expect_related(exp, 0); if (ret == 0) break; else if (ret != -EBUSY) { diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index dfb7ef8845bd..c691ab8d234c 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -53,7 +53,7 @@ static unsigned int help(struct sk_buff *skb, int ret; exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); + ret = nf_ct_expect_related(exp, 0); if (ret == 0) break; else if (ret != -EBUSY) { diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index e338d91980d8..f0a735e86851 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -414,7 +414,7 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, int ret; exp->tuple.dst.u.udp.port = htons(port); - ret = nf_ct_expect_related(exp); + ret = nf_ct_expect_related(exp, NF_CT_EXP_F_SKIP_MASTER); if (ret == 0) break; else if (ret != -EBUSY) { @@ -607,7 +607,8 @@ static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff, int ret; rtp_exp->tuple.dst.u.udp.port = htons(port); - ret = nf_ct_expect_related(rtp_exp); + ret = nf_ct_expect_related(rtp_exp, + NF_CT_EXP_F_SKIP_MASTER); if (ret == -EBUSY) continue; else if (ret < 0) { @@ -615,7 +616,8 @@ static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff, break; } rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); - ret = nf_ct_expect_related(rtcp_exp); + ret = nf_ct_expect_related(rtcp_exp, + NF_CT_EXP_F_SKIP_MASTER); if (ret == 0) break; else if (ret == -EBUSY) { diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c index 833a11f68031..1a591132d6eb 100644 --- a/net/netfilter/nf_nat_tftp.c +++ b/net/netfilter/nf_nat_tftp.c @@ -30,7 +30,7 @@ static unsigned int help(struct sk_buff *skb, = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; exp->dir = IP_CT_DIR_REPLY; exp->expectfn = nf_nat_follow_master; - if (nf_ct_expect_related(exp) != 0) { + if (nf_ct_expect_related(exp, 0) != 0) { nf_ct_helper_log(skb, exp->master, "cannot add expectation"); return NF_DROP; } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 827ab6196df9..46ca8bcca1bd 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1252,7 +1252,7 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj, priv->l4proto, NULL, &priv->dport); exp->timeout.expires = jiffies + priv->timeout * HZ; - if (nf_ct_expect_related(exp) != 0) + if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; } -- cgit v1.2.3 From 05ba4c895363db795f3d54f2da0de56d6520e52d Mon Sep 17 00:00:00 2001 From: Yonatan Goldschmidt Date: Mon, 8 Jul 2019 15:57:09 -0700 Subject: netfilter: Update obsolete comments referring to ip_conntrack In 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") the new generic nf_conntrack was introduced, and it came to supersede the old ip_conntrack. This change updates (some) of the obsolete comments referring to old file/function names of the ip_conntrack mechanism, as well as removes a few self-referencing comments that we shouldn't maintain anymore. I did not update any comments referring to historical actions (e.g, comments like "this file was derived from ..." were left untouched, even if the referenced file is no longer here). Signed-off-by: Yonatan Goldschmidt Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_h323_asn1.h | 3 +-- net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 ++-- net/netfilter/Kconfig | 6 ++---- net/netfilter/nf_conntrack_core.c | 4 +--- net/netfilter/nf_conntrack_h323_asn1.c | 5 ++--- net/netfilter/nf_conntrack_proto_gre.c | 2 -- net/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/netfilter/nf_nat_core.c | 2 +- 8 files changed, 10 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_h323_asn1.h b/include/linux/netfilter/nf_conntrack_h323_asn1.h index 91d6275292a5..19df78341fb3 100644 --- a/include/linux/netfilter/nf_conntrack_h323_asn1.h +++ b/include/linux/netfilter/nf_conntrack_h323_asn1.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /**************************************************************************** - * ip_conntrack_h323_asn1.h - BER and PER decoding library for H.323 - * conntrack/NAT module. + * BER and PER decoding library for H.323 conntrack/NAT module. * * Copyright (c) 2006 by Jing Min Zhao * diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 4d6bf7ac0792..6bdb1ab8af61 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -416,8 +416,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) ctinfo == IP_CT_RELATED_REPLY)) return XT_CONTINUE; - /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, - * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here + /* nf_conntrack_proto_icmp guarantees us that we only have ICMP_ECHO, + * TIMESTAMP, INFO_REQUEST or ICMP_ADDRESS type icmp packets from here * on, which all have an ID field [relevant for hashing]. */ hash = clusterip_hashfn(skb, cipinfo->config); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 32a45c03786e..0d65f4d39494 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -223,8 +223,6 @@ config NF_CONNTRACK_FTP of Network Address Translation on them. This is FTP support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. To compile it as a module, choose M here. If unsure, say N. @@ -338,7 +336,7 @@ config NF_CONNTRACK_SIP help SIP is an application-layer control protocol that can establish, modify, and terminate multimedia sessions (conferences) such as - Internet telephony calls. With the ip_conntrack_sip and + Internet telephony calls. With the nf_conntrack_sip and the nf_nat_sip modules you can support the protocol on a connection tracking/NATing firewall. @@ -1313,7 +1311,7 @@ config NETFILTER_XT_MATCH_HELPER depends on NETFILTER_ADVANCED help Helper matching allows you to match packets in dynamic connections - tracked by a conntrack-helper, ie. ip_conntrack_ftp + tracked by a conntrack-helper, ie. nf_conntrack_ftp To compile it as a module, choose M here. If unsure, say Y. diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index bdfeacee0817..a542761e90d1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1817,9 +1817,7 @@ EXPORT_SYMBOL_GPL(nf_ct_kill_acct); #include #include -/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be - * in ip_conntrack_core, since we don't want the protocols to autoload - * or depend on ctnetlink */ +/* Generic function for tcp/udp/sctp/dccp and alike. */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 8f6ba8162f0b..573cb4481481 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -1,11 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323 - * conntrack/NAT module. + * BER and PER decoding library for H.323 conntrack/NAT module. * * Copyright (c) 2006 by Jing Min Zhao * - * See ip_conntrack_helper_h323_asn1.h for details. + * See nf_conntrack_helper_h323_asn1.h for details. */ #ifdef __KERNEL__ diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index c2eb365f1723..5b05487a60d2 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * ip_conntrack_proto_gre.c - Version 3.0 - * * Connection tracking protocol helper module for GRE. * * GRE is a generic encapsulation protocol, which is generally not very diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index dd53e2b20f6b..097deba7441a 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -215,7 +215,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, return -NF_ACCEPT; } - /* See ip_conntrack_proto_tcp.c */ + /* See nf_conntrack_proto_tcp.c */ if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, state->hook, dataoff, IPPROTO_ICMP)) { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 9ab410455992..3f6023ed4966 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -519,7 +519,7 @@ another_round: * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in - * __ip_conntrack_confirm and drop the packet. */ + * __nf_conntrack_confirm and drop the packet. */ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, -- cgit v1.2.3 From b83329fb473f29d34d85d642e3a3313bb2871fa9 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 10 Jul 2019 12:05:57 +0200 Subject: netfilter: synproxy: fix erroneous tcp mss option Now synproxy sends the mss value set by the user on client syn-ack packet instead of the mss value that client announced. Fixes: 48b1de4c110a ("netfilter: add SYNPROXY core/target") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 1 + net/ipv4/netfilter/ipt_SYNPROXY.c | 2 ++ net/ipv6/netfilter/ip6t_SYNPROXY.c | 2 ++ net/netfilter/nf_synproxy_core.c | 4 ++-- net/netfilter/nft_synproxy.c | 2 ++ 5 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 8f00125b06f4..44513b93bd55 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -68,6 +68,7 @@ struct synproxy_options { u8 options; u8 wscale; u16 mss; + u16 mss_encode; u32 tsval; u32 tsecr; }; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 8e7f84ec783d..0e70f3f65f6f 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -36,6 +36,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; + opts.mss_encode = opts.mss; + opts.mss = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index e77ea1ed5edd..5cdb4a69d277 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -36,6 +36,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; + opts.mss_encode = opts.mss; + opts.mss = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index b101f187eda8..09718e5a9e41 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -470,7 +470,7 @@ synproxy_send_client_synack(struct net *net, struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; - u16 mss = opts->mss; + u16 mss = opts->mss_encode; iph = ip_hdr(skb); @@ -884,7 +884,7 @@ synproxy_send_client_synack_ipv6(struct net *net, struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; - u16 mss = opts->mss; + u16 mss = opts->mss_encode; iph = ipv6_hdr(skb); diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 80060ade8a5b..928e661d1517 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -31,6 +31,8 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, opts->options |= NF_SYNPROXY_OPT_ECN; opts->options &= priv->info.options; + opts->mss_encode = opts->mss; + opts->mss = info->mss; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, opts); else -- cgit v1.2.3 From 07b0fdecb2477396bcb69609019aade2b22124a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Jul 2019 07:58:31 -0700 Subject: blkcg: allow blkcg_policy->pd_stat() to print non-debug info too Currently, ->pd_stat() is called only when moduleparam blkcg_debug_stats is set which prevents it from printing non-debug policy-specific statistics. Let's move debug testing down so that ->pd_stat() can print non-debug stat too. This patch doesn't cause any visible behavior change. Signed-off-by: Tejun Heo Cc: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++------ block/blk-iolatency.c | 3 +++ include/linux/blk-cgroup.h | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 24ed26957367..55a7dc227dfb 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -54,7 +54,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ -static bool blkcg_debug_stats = false; +bool blkcg_debug_stats = false; static struct workqueue_struct *blkcg_punt_bio_wq; static bool blkcg_policy_enabled(struct request_queue *q, @@ -944,10 +944,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) dbytes, dios); } - if (!blkcg_debug_stats) - goto next; - - if (atomic_read(&blkg->use_delay)) { + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { has_stats = true; off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", @@ -967,7 +964,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) has_stats = true; off += written; } -next: + if (has_stats) { if (off < size - 1) { off += scnprintf(buf+off, size-off, "\n"); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index d973c38ee4fd..0fff7b56df0e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -917,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, unsigned long long avg_lat; unsigned long long cur_win; + if (!blkcg_debug_stats) + return 0; + if (iolat->ssd) return iolatency_ssd_stat(iolat, buf, size); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 689a58231288..12811091fd50 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -181,6 +181,7 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; +extern bool blkcg_debug_stats; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); -- cgit v1.2.3 From e0aaa332e6a97dae57ad59cdb19e21f83c3d081c Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 15 Jul 2019 12:00:21 +0200 Subject: xfrm interface: ifname may be wrong in logs The ifname is copied when the interface is created, but is never updated later. In fact, this property is used only in one error message, where the netdevice pointer is available, thus let's use it. Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/xfrm/xfrm_interface.c | 10 +--------- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b22db30c3d88..ad761ef84797 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -983,7 +983,6 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { - char name[IFNAMSIZ]; /* name of XFRM device */ int link; /* ifindex of underlying L2 interface */ u32 if_id; /* interface identifyer */ }; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 2310dc9e35c2..68336ee00d72 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -145,8 +145,6 @@ static int xfrmi_create(struct net_device *dev) if (err < 0) goto out; - strcpy(xi->p.name, dev->name); - dev_hold(dev); xfrmi_link(xfrmn, xi); @@ -294,7 +292,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (tdev == dev) { stats->collisions++; net_warn_ratelimited("%s: Local routing loop detected!\n", - xi->p.name); + dev->name); goto tx_err_dst_release; } @@ -638,12 +636,6 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, int err; xfrmi_netlink_parms(data, &p); - - if (!tb[IFLA_IFNAME]) - return -EINVAL; - - nla_strlcpy(p.name, tb[IFLA_IFNAME], IFNAMSIZ); - xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; -- cgit v1.2.3 From 22d6552f827ef76ade3edf6bbb3f05048a0a7d8b Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 15 Jul 2019 12:00:23 +0200 Subject: xfrm interface: fix management of phydev With the current implementation, phydev cannot be removed: $ ip link add dummy type dummy $ ip link add xfrm1 type xfrm dev dummy if_id 1 $ ip l d dummy kernel:[77938.465445] unregister_netdevice: waiting for dummy to become free. Usage count = 1 Manage it like in ip tunnels, ie just keep the ifindex. Not that the side effect, is that the phydev is now optional. Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Signed-off-by: Nicolas Dichtel Tested-by: Julien Floret Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/xfrm/xfrm_interface.c | 32 +++++++++++++++++--------------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index ad761ef84797..aa08a7a5f6ac 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -990,7 +990,6 @@ struct xfrm_if_parms { struct xfrm_if { struct xfrm_if __rcu *next; /* next interface in list */ struct net_device *dev; /* virtual device associated with interface */ - struct net_device *phydev; /* physical device */ struct net *net; /* netns for packet i/o */ struct xfrm_if_parms p; /* interface parms */ diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 53e5e47b2c55..2ab4859df55a 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -175,7 +175,6 @@ static void xfrmi_dev_uninit(struct net_device *dev) struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); xfrmi_unlink(xfrmn, xi); - dev_put(xi->phydev); dev_put(dev); } @@ -362,7 +361,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_err; } - fl.flowi_oif = xi->phydev->ifindex; + fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) @@ -548,7 +547,7 @@ static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - return xi->phydev->ifindex; + return xi->p.link; } @@ -574,12 +573,14 @@ static void xfrmi_dev_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; netif_keep_dst(dev); + + eth_broadcast_addr(dev->broadcast); } static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - struct net_device *phydev = xi->phydev; + struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); @@ -594,13 +595,19 @@ static int xfrmi_dev_init(struct net_device *dev) dev->features |= NETIF_F_LLTX; - dev->needed_headroom = phydev->needed_headroom; - dev->needed_tailroom = phydev->needed_tailroom; + if (phydev) { + dev->needed_headroom = phydev->needed_headroom; + dev->needed_tailroom = phydev->needed_tailroom; - if (is_zero_ether_addr(dev->dev_addr)) - eth_hw_addr_inherit(dev, phydev); - if (is_zero_ether_addr(dev->broadcast)) - memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); + if (is_zero_ether_addr(dev->dev_addr)) + eth_hw_addr_inherit(dev, phydev); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, phydev->broadcast, + dev->addr_len); + } else { + eth_hw_addr_random(dev); + eth_broadcast_addr(dev->broadcast); + } return 0; } @@ -644,13 +651,8 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, xi->p = p; xi->net = net; xi->dev = dev; - xi->phydev = dev_get_by_index(net, p.link); - if (!xi->phydev) - return -ENODEV; err = xfrmi_create(dev); - if (err < 0) - dev_put(xi->phydev); return err; } -- cgit v1.2.3 From 4d2e26a38fbcde2ba14882cbdb845caa1c17e19b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 10 Apr 2019 08:32:42 -0300 Subject: docs: powerpc: convert docs to ReST and rename to *.rst Convert docs to ReST and add them to the arch-specific book. The conversion here was trivial, as almost every file there was already using an elegant format close to ReST standard. The changes were mostly to mark literal blocks and add a few missing section title identifiers. One note with regards to "--": on Sphinx, this can't be used to identify a list, as it will format it badly. This can be used, however, to identify a long hyphen - and "---" is an even longer one. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Acked-by: Andrew Donnellan # cxl --- Documentation/PCI/pci-error-recovery.rst | 5 +- Documentation/index.rst | 1 + Documentation/powerpc/DAWR-POWER9.txt | 90 ---- Documentation/powerpc/bootwrapper.rst | 155 ++++++ Documentation/powerpc/bootwrapper.txt | 141 ----- Documentation/powerpc/cpu_families.rst | 222 ++++++++ Documentation/powerpc/cpu_families.txt | 221 -------- Documentation/powerpc/cpu_features.rst | 60 +++ Documentation/powerpc/cpu_features.txt | 56 -- Documentation/powerpc/cxl.rst | 467 +++++++++++++++++ Documentation/powerpc/cxl.txt | 449 ---------------- Documentation/powerpc/cxlflash.rst | 433 +++++++++++++++ Documentation/powerpc/cxlflash.txt | 429 --------------- Documentation/powerpc/dawr-power9.rst | 93 ++++ Documentation/powerpc/dscr.rst | 87 +++ Documentation/powerpc/dscr.txt | 83 --- Documentation/powerpc/eeh-pci-error-recovery.rst | 336 ++++++++++++ Documentation/powerpc/eeh-pci-error-recovery.txt | 334 ------------ Documentation/powerpc/firmware-assisted-dump.rst | 301 +++++++++++ Documentation/powerpc/firmware-assisted-dump.txt | 292 ----------- Documentation/powerpc/hvcs.rst | 581 +++++++++++++++++++++ Documentation/powerpc/hvcs.txt | 567 -------------------- Documentation/powerpc/index.rst | 34 ++ Documentation/powerpc/isa-versions.rst | 15 +- Documentation/powerpc/mpc52xx.rst | 43 ++ Documentation/powerpc/mpc52xx.txt | 39 -- .../powerpc/pci_iov_resource_on_powernv.rst | 312 +++++++++++ .../powerpc/pci_iov_resource_on_powernv.txt | 301 ----------- Documentation/powerpc/pmu-ebb.rst | 138 +++++ Documentation/powerpc/pmu-ebb.txt | 137 ----- Documentation/powerpc/ptrace.rst | 156 ++++++ Documentation/powerpc/ptrace.txt | 151 ------ Documentation/powerpc/qe_firmware.rst | 296 +++++++++++ Documentation/powerpc/qe_firmware.txt | 295 ----------- Documentation/powerpc/syscall64-abi.rst | 110 ++++ Documentation/powerpc/syscall64-abi.txt | 105 ---- Documentation/powerpc/transactional_memory.rst | 247 +++++++++ Documentation/powerpc/transactional_memory.txt | 244 --------- MAINTAINERS | 6 +- arch/powerpc/kernel/exceptions-64s.S | 2 +- drivers/soc/fsl/qe/qe.c | 2 +- drivers/tty/hvc/hvcs.c | 2 +- include/soc/fsl/qe/qe.h | 2 +- 43 files changed, 4090 insertions(+), 3950 deletions(-) delete mode 100644 Documentation/powerpc/DAWR-POWER9.txt create mode 100644 Documentation/powerpc/bootwrapper.rst delete mode 100644 Documentation/powerpc/bootwrapper.txt create mode 100644 Documentation/powerpc/cpu_families.rst delete mode 100644 Documentation/powerpc/cpu_families.txt create mode 100644 Documentation/powerpc/cpu_features.rst delete mode 100644 Documentation/powerpc/cpu_features.txt create mode 100644 Documentation/powerpc/cxl.rst delete mode 100644 Documentation/powerpc/cxl.txt create mode 100644 Documentation/powerpc/cxlflash.rst delete mode 100644 Documentation/powerpc/cxlflash.txt create mode 100644 Documentation/powerpc/dawr-power9.rst create mode 100644 Documentation/powerpc/dscr.rst delete mode 100644 Documentation/powerpc/dscr.txt create mode 100644 Documentation/powerpc/eeh-pci-error-recovery.rst delete mode 100644 Documentation/powerpc/eeh-pci-error-recovery.txt create mode 100644 Documentation/powerpc/firmware-assisted-dump.rst delete mode 100644 Documentation/powerpc/firmware-assisted-dump.txt create mode 100644 Documentation/powerpc/hvcs.rst delete mode 100644 Documentation/powerpc/hvcs.txt create mode 100644 Documentation/powerpc/index.rst create mode 100644 Documentation/powerpc/mpc52xx.rst delete mode 100644 Documentation/powerpc/mpc52xx.txt create mode 100644 Documentation/powerpc/pci_iov_resource_on_powernv.rst delete mode 100644 Documentation/powerpc/pci_iov_resource_on_powernv.txt create mode 100644 Documentation/powerpc/pmu-ebb.rst delete mode 100644 Documentation/powerpc/pmu-ebb.txt create mode 100644 Documentation/powerpc/ptrace.rst delete mode 100644 Documentation/powerpc/ptrace.txt create mode 100644 Documentation/powerpc/qe_firmware.rst delete mode 100644 Documentation/powerpc/qe_firmware.txt create mode 100644 Documentation/powerpc/syscall64-abi.rst delete mode 100644 Documentation/powerpc/syscall64-abi.txt create mode 100644 Documentation/powerpc/transactional_memory.rst delete mode 100644 Documentation/powerpc/transactional_memory.txt (limited to 'include') diff --git a/Documentation/PCI/pci-error-recovery.rst b/Documentation/PCI/pci-error-recovery.rst index 83db42092935..e5d450df06b4 100644 --- a/Documentation/PCI/pci-error-recovery.rst +++ b/Documentation/PCI/pci-error-recovery.rst @@ -403,7 +403,7 @@ That is, the recovery API only requires that: .. note:: Implementation details for the powerpc platform are discussed in - the file Documentation/powerpc/eeh-pci-error-recovery.txt + the file Documentation/powerpc/eeh-pci-error-recovery.rst As of this writing, there is a growing list of device drivers with patches implementing error recovery. Not all of these patches are in @@ -422,3 +422,6 @@ That is, the recovery API only requires that: - drivers/net/cxgb3 - drivers/net/s2io.c - drivers/net/qlge + +The End +------- diff --git a/Documentation/index.rst b/Documentation/index.rst index 70ae148ec980..3fe6170aa41d 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -143,6 +143,7 @@ implementation. arm64/index ia64/index m68k/index + powerpc/index riscv/index s390/index sh/index diff --git a/Documentation/powerpc/DAWR-POWER9.txt b/Documentation/powerpc/DAWR-POWER9.txt deleted file mode 100644 index ecdbb076438c..000000000000 --- a/Documentation/powerpc/DAWR-POWER9.txt +++ /dev/null @@ -1,90 +0,0 @@ -DAWR issues on POWER9 -============================ - -On POWER9 the Data Address Watchpoint Register (DAWR) can cause a checkstop -if it points to cache inhibited (CI) memory. Currently Linux has no way to -disinguish CI memory when configuring the DAWR, so (for now) the DAWR is -disabled by this commit: - - commit 9654153158d3e0684a1bdb76dbababdb7111d5a0 - Author: Michael Neuling - Date: Tue Mar 27 15:37:24 2018 +1100 - powerpc: Disable DAWR in the base POWER9 CPU features - -Technical Details: -============================ - -DAWR has 6 different ways of being set. -1) ptrace -2) h_set_mode(DAWR) -3) h_set_dabr() -4) kvmppc_set_one_reg() -5) xmon - -For ptrace, we now advertise zero breakpoints on POWER9 via the -PPC_PTRACE_GETHWDBGINFO call. This results in GDB falling back to -software emulation of the watchpoint (which is slow). - -h_set_mode(DAWR) and h_set_dabr() will now return an error to the -guest on a POWER9 host. Current Linux guests ignore this error, so -they will silently not get the DAWR. - -kvmppc_set_one_reg() will store the value in the vcpu but won't -actually set it on POWER9 hardware. This is done so we don't break -migration from POWER8 to POWER9, at the cost of silently losing the -DAWR on the migration. - -For xmon, the 'bd' command will return an error on P9. - -Consequences for users -============================ - -For GDB watchpoints (ie 'watch' command) on POWER9 bare metal , GDB -will accept the command. Unfortunately since there is no hardware -support for the watchpoint, GDB will software emulate the watchpoint -making it run very slowly. - -The same will also be true for any guests started on a POWER9 -host. The watchpoint will fail and GDB will fall back to software -emulation. - -If a guest is started on a POWER8 host, GDB will accept the watchpoint -and configure the hardware to use the DAWR. This will run at full -speed since it can use the hardware emulation. Unfortunately if this -guest is migrated to a POWER9 host, the watchpoint will be lost on the -POWER9. Loads and stores to the watchpoint locations will not be -trapped in GDB. The watchpoint is remembered, so if the guest is -migrated back to the POWER8 host, it will start working again. - -Force enabling the DAWR -============================= -Kernels (since ~v5.2) have an option to force enable the DAWR via: - - echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous - -This enables the DAWR even on POWER9. - -This is a dangerous setting, USE AT YOUR OWN RISK. - -Some users may not care about a bad user crashing their box -(ie. single user/desktop systems) and really want the DAWR. This -allows them to force enable DAWR. - -This flag can also be used to disable DAWR access. Once this is -cleared, all DAWR access should be cleared immediately and your -machine once again safe from crashing. - -Userspace may get confused by toggling this. If DAWR is force -enabled/disabled between getting the number of breakpoints (via -PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an -inconsistent view of what's available. Similarly for guests. - -For the DAWR to be enabled in a KVM guest, the DAWR needs to be force -enabled in the host AND the guest. For this reason, this won't work on -POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the -dawr_enable_dangerous file will fail if the hypervisor doesn't support -writing the DAWR. - -To double check the DAWR is working, run this kernel selftest: - tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c -Any errors/failures/skips mean something is wrong. diff --git a/Documentation/powerpc/bootwrapper.rst b/Documentation/powerpc/bootwrapper.rst new file mode 100644 index 000000000000..a6292afba573 --- /dev/null +++ b/Documentation/powerpc/bootwrapper.rst @@ -0,0 +1,155 @@ +======================== +The PowerPC boot wrapper +======================== + +Copyright (C) Secret Lab Technologies Ltd. + +PowerPC image targets compresses and wraps the kernel image (vmlinux) with +a boot wrapper to make it usable by the system firmware. There is no +standard PowerPC firmware interface, so the boot wrapper is designed to +be adaptable for each kind of image that needs to be built. + +The boot wrapper can be found in the arch/powerpc/boot/ directory. The +Makefile in that directory has targets for all the available image types. +The different image types are used to support all of the various firmware +interfaces found on PowerPC platforms. OpenFirmware is the most commonly +used firmware type on general purpose PowerPC systems from Apple, IBM and +others. U-Boot is typically found on embedded PowerPC hardware, but there +are a handful of other firmware implementations which are also popular. Each +firmware interface requires a different image format. + +The boot wrapper is built from the makefile in arch/powerpc/boot/Makefile and +it uses the wrapper script (arch/powerpc/boot/wrapper) to generate target +image. The details of the build system is discussed in the next section. +Currently, the following image format targets exist: + + ==================== ======================================================== + cuImage.%: Backwards compatible uImage for older version of + U-Boot (for versions that don't understand the device + tree). This image embeds a device tree blob inside + the image. The boot wrapper, kernel and device tree + are all embedded inside the U-Boot uImage file format + with boot wrapper code that extracts data from the old + bd_info structure and loads the data into the device + tree before jumping into the kernel. + + Because of the series of #ifdefs found in the + bd_info structure used in the old U-Boot interfaces, + cuImages are platform specific. Each specific + U-Boot platform has a different platform init file + which populates the embedded device tree with data + from the platform specific bd_info file. The platform + specific cuImage platform init code can be found in + `arch/powerpc/boot/cuboot.*.c`. Selection of the correct + cuImage init code for a specific board can be found in + the wrapper structure. + + dtbImage.%: Similar to zImage, except device tree blob is embedded + inside the image instead of provided by firmware. The + output image file can be either an elf file or a flat + binary depending on the platform. + + dtbImages are used on systems which do not have an + interface for passing a device tree directly. + dtbImages are similar to simpleImages except that + dtbImages have platform specific code for extracting + data from the board firmware, but simpleImages do not + talk to the firmware at all. + + PlayStation 3 support uses dtbImage. So do Embedded + Planet boards using the PlanetCore firmware. Board + specific initialization code is typically found in a + file named arch/powerpc/boot/.c; but this + can be overridden by the wrapper script. + + simpleImage.%: Firmware independent compressed image that does not + depend on any particular firmware interface and embeds + a device tree blob. This image is a flat binary that + can be loaded to any location in RAM and jumped to. + Firmware cannot pass any configuration data to the + kernel with this image type and it depends entirely on + the embedded device tree for all information. + + The simpleImage is useful for booting systems with + an unknown firmware interface or for booting from + a debugger when no firmware is present (such as on + the Xilinx Virtex platform). The only assumption that + simpleImage makes is that RAM is correctly initialized + and that the MMU is either off or has RAM mapped to + base address 0. + + simpleImage also supports inserting special platform + specific initialization code to the start of the bootup + sequence. The virtex405 platform uses this feature to + ensure that the cache is invalidated before caching + is enabled. Platform specific initialization code is + added as part of the wrapper script and is keyed on + the image target name. For example, all + simpleImage.virtex405-* targets will add the + virtex405-head.S initialization code (This also means + that the dts file for virtex405 targets should be + named (virtex405-.dts). Search the wrapper + script for 'virtex405' and see the file + arch/powerpc/boot/virtex405-head.S for details. + + treeImage.%; Image format for used with OpenBIOS firmware found + on some ppc4xx hardware. This image embeds a device + tree blob inside the image. + + uImage: Native image format used by U-Boot. The uImage target + does not add any boot code. It just wraps a compressed + vmlinux in the uImage data structure. This image + requires a version of U-Boot that is able to pass + a device tree to the kernel at boot. If using an older + version of U-Boot, then you need to use a cuImage + instead. + + zImage.%: Image format which does not embed a device tree. + Used by OpenFirmware and other firmware interfaces + which are able to supply a device tree. This image + expects firmware to provide the device tree at boot. + Typically, if you have general purpose PowerPC + hardware then you want this image format. + ==================== ======================================================== + +Image types which embed a device tree blob (simpleImage, dtbImage, treeImage, +and cuImage) all generate the device tree blob from a file in the +arch/powerpc/boot/dts/ directory. The Makefile selects the correct device +tree source based on the name of the target. Therefore, if the kernel is +built with 'make treeImage.walnut simpleImage.virtex405-ml403', then the +build system will use arch/powerpc/boot/dts/walnut.dts to build +treeImage.walnut and arch/powerpc/boot/dts/virtex405-ml403.dts to build +the simpleImage.virtex405-ml403. + +Two special targets called 'zImage' and 'zImage.initrd' also exist. These +targets build all the default images as selected by the kernel configuration. +Default images are selected by the boot wrapper Makefile +(arch/powerpc/boot/Makefile) by adding targets to the $image-y variable. Look +at the Makefile to see which default image targets are available. + +How it is built +--------------- +arch/powerpc is designed to support multiplatform kernels, which means +that a single vmlinux image can be booted on many different target boards. +It also means that the boot wrapper must be able to wrap for many kinds of +images on a single build. The design decision was made to not use any +conditional compilation code (#ifdef, etc) in the boot wrapper source code. +All of the boot wrapper pieces are buildable at any time regardless of the +kernel configuration. Building all the wrapper bits on every kernel build +also ensures that obscure parts of the wrapper are at the very least compile +tested in a large variety of environments. + +The wrapper is adapted for different image types at link time by linking in +just the wrapper bits that are appropriate for the image type. The 'wrapper +script' (found in arch/powerpc/boot/wrapper) is called by the Makefile and +is responsible for selecting the correct wrapper bits for the image type. +The arguments are well documented in the script's comment block, so they +are not repeated here. However, it is worth mentioning that the script +uses the -p (platform) argument as the main method of deciding which wrapper +bits to compile in. Look for the large 'case "$platform" in' block in the +middle of the script. This is also the place where platform specific fixups +can be selected by changing the link order. + +In particular, care should be taken when working with cuImages. cuImage +wrapper bits are very board specific and care should be taken to make sure +the target you are trying to build is supported by the wrapper bits. diff --git a/Documentation/powerpc/bootwrapper.txt b/Documentation/powerpc/bootwrapper.txt deleted file mode 100644 index d60fced5e1cc..000000000000 --- a/Documentation/powerpc/bootwrapper.txt +++ /dev/null @@ -1,141 +0,0 @@ -The PowerPC boot wrapper ------------------------- -Copyright (C) Secret Lab Technologies Ltd. - -PowerPC image targets compresses and wraps the kernel image (vmlinux) with -a boot wrapper to make it usable by the system firmware. There is no -standard PowerPC firmware interface, so the boot wrapper is designed to -be adaptable for each kind of image that needs to be built. - -The boot wrapper can be found in the arch/powerpc/boot/ directory. The -Makefile in that directory has targets for all the available image types. -The different image types are used to support all of the various firmware -interfaces found on PowerPC platforms. OpenFirmware is the most commonly -used firmware type on general purpose PowerPC systems from Apple, IBM and -others. U-Boot is typically found on embedded PowerPC hardware, but there -are a handful of other firmware implementations which are also popular. Each -firmware interface requires a different image format. - -The boot wrapper is built from the makefile in arch/powerpc/boot/Makefile and -it uses the wrapper script (arch/powerpc/boot/wrapper) to generate target -image. The details of the build system is discussed in the next section. -Currently, the following image format targets exist: - - cuImage.%: Backwards compatible uImage for older version of - U-Boot (for versions that don't understand the device - tree). This image embeds a device tree blob inside - the image. The boot wrapper, kernel and device tree - are all embedded inside the U-Boot uImage file format - with boot wrapper code that extracts data from the old - bd_info structure and loads the data into the device - tree before jumping into the kernel. - Because of the series of #ifdefs found in the - bd_info structure used in the old U-Boot interfaces, - cuImages are platform specific. Each specific - U-Boot platform has a different platform init file - which populates the embedded device tree with data - from the platform specific bd_info file. The platform - specific cuImage platform init code can be found in - arch/powerpc/boot/cuboot.*.c. Selection of the correct - cuImage init code for a specific board can be found in - the wrapper structure. - dtbImage.%: Similar to zImage, except device tree blob is embedded - inside the image instead of provided by firmware. The - output image file can be either an elf file or a flat - binary depending on the platform. - dtbImages are used on systems which do not have an - interface for passing a device tree directly. - dtbImages are similar to simpleImages except that - dtbImages have platform specific code for extracting - data from the board firmware, but simpleImages do not - talk to the firmware at all. - PlayStation 3 support uses dtbImage. So do Embedded - Planet boards using the PlanetCore firmware. Board - specific initialization code is typically found in a - file named arch/powerpc/boot/.c; but this - can be overridden by the wrapper script. - simpleImage.%: Firmware independent compressed image that does not - depend on any particular firmware interface and embeds - a device tree blob. This image is a flat binary that - can be loaded to any location in RAM and jumped to. - Firmware cannot pass any configuration data to the - kernel with this image type and it depends entirely on - the embedded device tree for all information. - The simpleImage is useful for booting systems with - an unknown firmware interface or for booting from - a debugger when no firmware is present (such as on - the Xilinx Virtex platform). The only assumption that - simpleImage makes is that RAM is correctly initialized - and that the MMU is either off or has RAM mapped to - base address 0. - simpleImage also supports inserting special platform - specific initialization code to the start of the bootup - sequence. The virtex405 platform uses this feature to - ensure that the cache is invalidated before caching - is enabled. Platform specific initialization code is - added as part of the wrapper script and is keyed on - the image target name. For example, all - simpleImage.virtex405-* targets will add the - virtex405-head.S initialization code (This also means - that the dts file for virtex405 targets should be - named (virtex405-.dts). Search the wrapper - script for 'virtex405' and see the file - arch/powerpc/boot/virtex405-head.S for details. - treeImage.%; Image format for used with OpenBIOS firmware found - on some ppc4xx hardware. This image embeds a device - tree blob inside the image. - uImage: Native image format used by U-Boot. The uImage target - does not add any boot code. It just wraps a compressed - vmlinux in the uImage data structure. This image - requires a version of U-Boot that is able to pass - a device tree to the kernel at boot. If using an older - version of U-Boot, then you need to use a cuImage - instead. - zImage.%: Image format which does not embed a device tree. - Used by OpenFirmware and other firmware interfaces - which are able to supply a device tree. This image - expects firmware to provide the device tree at boot. - Typically, if you have general purpose PowerPC - hardware then you want this image format. - -Image types which embed a device tree blob (simpleImage, dtbImage, treeImage, -and cuImage) all generate the device tree blob from a file in the -arch/powerpc/boot/dts/ directory. The Makefile selects the correct device -tree source based on the name of the target. Therefore, if the kernel is -built with 'make treeImage.walnut simpleImage.virtex405-ml403', then the -build system will use arch/powerpc/boot/dts/walnut.dts to build -treeImage.walnut and arch/powerpc/boot/dts/virtex405-ml403.dts to build -the simpleImage.virtex405-ml403. - -Two special targets called 'zImage' and 'zImage.initrd' also exist. These -targets build all the default images as selected by the kernel configuration. -Default images are selected by the boot wrapper Makefile -(arch/powerpc/boot/Makefile) by adding targets to the $image-y variable. Look -at the Makefile to see which default image targets are available. - -How it is built ---------------- -arch/powerpc is designed to support multiplatform kernels, which means -that a single vmlinux image can be booted on many different target boards. -It also means that the boot wrapper must be able to wrap for many kinds of -images on a single build. The design decision was made to not use any -conditional compilation code (#ifdef, etc) in the boot wrapper source code. -All of the boot wrapper pieces are buildable at any time regardless of the -kernel configuration. Building all the wrapper bits on every kernel build -also ensures that obscure parts of the wrapper are at the very least compile -tested in a large variety of environments. - -The wrapper is adapted for different image types at link time by linking in -just the wrapper bits that are appropriate for the image type. The 'wrapper -script' (found in arch/powerpc/boot/wrapper) is called by the Makefile and -is responsible for selecting the correct wrapper bits for the image type. -The arguments are well documented in the script's comment block, so they -are not repeated here. However, it is worth mentioning that the script -uses the -p (platform) argument as the main method of deciding which wrapper -bits to compile in. Look for the large 'case "$platform" in' block in the -middle of the script. This is also the place where platform specific fixups -can be selected by changing the link order. - -In particular, care should be taken when working with cuImages. cuImage -wrapper bits are very board specific and care should be taken to make sure -the target you are trying to build is supported by the wrapper bits. diff --git a/Documentation/powerpc/cpu_families.rst b/Documentation/powerpc/cpu_families.rst new file mode 100644 index 000000000000..1e063c5440c3 --- /dev/null +++ b/Documentation/powerpc/cpu_families.rst @@ -0,0 +1,222 @@ +============ +CPU Families +============ + +This document tries to summarise some of the different cpu families that exist +and are supported by arch/powerpc. + + +Book3S (aka sPAPR) +------------------ + +- Hash MMU +- Mix of 32 & 64 bit:: + + +--------------+ +----------------+ + | Old POWER | --------------> | RS64 (threads) | + +--------------+ +----------------+ + | + | + v + +--------------+ +----------------+ +------+ + | 601 | --------------> | 603 | ---> | e300 | + +--------------+ +----------------+ +------+ + | | + | | + v v + +--------------+ +----------------+ +-------+ + | 604 | | 750 (G3) | ---> | 750CX | + +--------------+ +----------------+ +-------+ + | | | + | | | + v v v + +--------------+ +----------------+ +-------+ + | 620 (64 bit) | | 7400 | | 750CL | + +--------------+ +----------------+ +-------+ + | | | + | | | + v v v + +--------------+ +----------------+ +-------+ + | POWER3/630 | | 7410 | | 750FX | + +--------------+ +----------------+ +-------+ + | | + | | + v v + +--------------+ +----------------+ + | POWER3+ | | 7450 | + +--------------+ +----------------+ + | | + | | + v v + +--------------+ +----------------+ + | POWER4 | | 7455 | + +--------------+ +----------------+ + | | + | | + v v + +--------------+ +-------+ +----------------+ + | POWER4+ | --> | 970 | | 7447 | + +--------------+ +-------+ +----------------+ + | | | + | | | + v v v + +--------------+ +-------+ +----------------+ + | POWER5 | | 970FX | | 7448 | + +--------------+ +-------+ +----------------+ + | | | + | | | + v v v + +--------------+ +-------+ +----------------+ + | POWER5+ | | 970MP | | e600 | + +--------------+ +-------+ +----------------+ + | + | + v + +--------------+ + | POWER5++ | + +--------------+ + | + | + v + +--------------+ +-------+ + | POWER6 | <-?-> | Cell | + +--------------+ +-------+ + | + | + v + +--------------+ + | POWER7 | + +--------------+ + | + | + v + +--------------+ + | POWER7+ | + +--------------+ + | + | + v + +--------------+ + | POWER8 | + +--------------+ + + + +---------------+ + | PA6T (64 bit) | + +---------------+ + + +IBM BookE +--------- + +- Software loaded TLB. +- All 32 bit:: + + +--------------+ + | 401 | + +--------------+ + | + | + v + +--------------+ + | 403 | + +--------------+ + | + | + v + +--------------+ + | 405 | + +--------------+ + | + | + v + +--------------+ + | 440 | + +--------------+ + | + | + v + +--------------+ +----------------+ + | 450 | --> | BG/P | + +--------------+ +----------------+ + | + | + v + +--------------+ + | 460 | + +--------------+ + | + | + v + +--------------+ + | 476 | + +--------------+ + + +Motorola/Freescale 8xx +---------------------- + +- Software loaded with hardware assist. +- All 32 bit:: + + +-------------+ + | MPC8xx Core | + +-------------+ + + +Freescale BookE +--------------- + +- Software loaded TLB. +- e6500 adds HW loaded indirect TLB entries. +- Mix of 32 & 64 bit:: + + +--------------+ + | e200 | + +--------------+ + + + +--------------------------------+ + | e500 | + +--------------------------------+ + | + | + v + +--------------------------------+ + | e500v2 | + +--------------------------------+ + | + | + v + +--------------------------------+ + | e500mc (Book3e) | + +--------------------------------+ + | + | + v + +--------------------------------+ + | e5500 (64 bit) | + +--------------------------------+ + | + | + v + +--------------------------------+ + | e6500 (HW TLB) (Multithreaded) | + +--------------------------------+ + + +IBM A2 core +----------- + +- Book3E, software loaded TLB + HW loaded indirect TLB entries. +- 64 bit:: + + +--------------+ +----------------+ + | A2 core | --> | WSP | + +--------------+ +----------------+ + | + | + v + +--------------+ + | BG/Q | + +--------------+ diff --git a/Documentation/powerpc/cpu_families.txt b/Documentation/powerpc/cpu_families.txt deleted file mode 100644 index fc08e22feb1a..000000000000 --- a/Documentation/powerpc/cpu_families.txt +++ /dev/null @@ -1,221 +0,0 @@ -CPU Families -============ - -This document tries to summarise some of the different cpu families that exist -and are supported by arch/powerpc. - - -Book3S (aka sPAPR) ------------------- - - - Hash MMU - - Mix of 32 & 64 bit - - +--------------+ +----------------+ - | Old POWER | --------------> | RS64 (threads) | - +--------------+ +----------------+ - | - | - v - +--------------+ +----------------+ +------+ - | 601 | --------------> | 603 | ---> | e300 | - +--------------+ +----------------+ +------+ - | | - | | - v v - +--------------+ +----------------+ +-------+ - | 604 | | 750 (G3) | ---> | 750CX | - +--------------+ +----------------+ +-------+ - | | | - | | | - v v v - +--------------+ +----------------+ +-------+ - | 620 (64 bit) | | 7400 | | 750CL | - +--------------+ +----------------+ +-------+ - | | | - | | | - v v v - +--------------+ +----------------+ +-------+ - | POWER3/630 | | 7410 | | 750FX | - +--------------+ +----------------+ +-------+ - | | - | | - v v - +--------------+ +----------------+ - | POWER3+ | | 7450 | - +--------------+ +----------------+ - | | - | | - v v - +--------------+ +----------------+ - | POWER4 | | 7455 | - +--------------+ +----------------+ - | | - | | - v v - +--------------+ +-------+ +----------------+ - | POWER4+ | --> | 970 | | 7447 | - +--------------+ +-------+ +----------------+ - | | | - | | | - v v v - +--------------+ +-------+ +----------------+ - | POWER5 | | 970FX | | 7448 | - +--------------+ +-------+ +----------------+ - | | | - | | | - v v v - +--------------+ +-------+ +----------------+ - | POWER5+ | | 970MP | | e600 | - +--------------+ +-------+ +----------------+ - | - | - v - +--------------+ - | POWER5++ | - +--------------+ - | - | - v - +--------------+ +-------+ - | POWER6 | <-?-> | Cell | - +--------------+ +-------+ - | - | - v - +--------------+ - | POWER7 | - +--------------+ - | - | - v - +--------------+ - | POWER7+ | - +--------------+ - | - | - v - +--------------+ - | POWER8 | - +--------------+ - - - +---------------+ - | PA6T (64 bit) | - +---------------+ - - -IBM BookE ---------- - - - Software loaded TLB. - - All 32 bit - - +--------------+ - | 401 | - +--------------+ - | - | - v - +--------------+ - | 403 | - +--------------+ - | - | - v - +--------------+ - | 405 | - +--------------+ - | - | - v - +--------------+ - | 440 | - +--------------+ - | - | - v - +--------------+ +----------------+ - | 450 | --> | BG/P | - +--------------+ +----------------+ - | - | - v - +--------------+ - | 460 | - +--------------+ - | - | - v - +--------------+ - | 476 | - +--------------+ - - -Motorola/Freescale 8xx ----------------------- - - - Software loaded with hardware assist. - - All 32 bit - - +-------------+ - | MPC8xx Core | - +-------------+ - - -Freescale BookE ---------------- - - - Software loaded TLB. - - e6500 adds HW loaded indirect TLB entries. - - Mix of 32 & 64 bit - - +--------------+ - | e200 | - +--------------+ - - - +--------------------------------+ - | e500 | - +--------------------------------+ - | - | - v - +--------------------------------+ - | e500v2 | - +--------------------------------+ - | - | - v - +--------------------------------+ - | e500mc (Book3e) | - +--------------------------------+ - | - | - v - +--------------------------------+ - | e5500 (64 bit) | - +--------------------------------+ - | - | - v - +--------------------------------+ - | e6500 (HW TLB) (Multithreaded) | - +--------------------------------+ - - -IBM A2 core ------------ - - - Book3E, software loaded TLB + HW loaded indirect TLB entries. - - 64 bit - - +--------------+ +----------------+ - | A2 core | --> | WSP | - +--------------+ +----------------+ - | - | - v - +--------------+ - | BG/Q | - +--------------+ diff --git a/Documentation/powerpc/cpu_features.rst b/Documentation/powerpc/cpu_features.rst new file mode 100644 index 000000000000..b7bcdd2f41bb --- /dev/null +++ b/Documentation/powerpc/cpu_features.rst @@ -0,0 +1,60 @@ +============ +CPU Features +============ + +Hollis Blanchard +5 Jun 2002 + +This document describes the system (including self-modifying code) used in the +PPC Linux kernel to support a variety of PowerPC CPUs without requiring +compile-time selection. + +Early in the boot process the ppc32 kernel detects the current CPU type and +chooses a set of features accordingly. Some examples include Altivec support, +split instruction and data caches, and if the CPU supports the DOZE and NAP +sleep modes. + +Detection of the feature set is simple. A list of processors can be found in +arch/powerpc/kernel/cputable.c. The PVR register is masked and compared with +each value in the list. If a match is found, the cpu_features of cur_cpu_spec +is assigned to the feature bitmask for this processor and a __setup_cpu +function is called. + +C code may test 'cur_cpu_spec[smp_processor_id()]->cpu_features' for a +particular feature bit. This is done in quite a few places, for example +in ppc_setup_l2cr(). + +Implementing cpufeatures in assembly is a little more involved. There are +several paths that are performance-critical and would suffer if an array +index, structure dereference, and conditional branch were added. To avoid the +performance penalty but still allow for runtime (rather than compile-time) CPU +selection, unused code is replaced by 'nop' instructions. This nop'ing is +based on CPU 0's capabilities, so a multi-processor system with non-identical +processors will not work (but such a system would likely have other problems +anyways). + +After detecting the processor type, the kernel patches out sections of code +that shouldn't be used by writing nop's over it. Using cpufeatures requires +just 2 macros (found in arch/powerpc/include/asm/cputable.h), as seen in head.S +transfer_to_handler:: + + #ifdef CONFIG_ALTIVEC + BEGIN_FTR_SECTION + mfspr r22,SPRN_VRSAVE /* if G4, save vrsave register value */ + stw r22,THREAD_VRSAVE(r23) + END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) + #endif /* CONFIG_ALTIVEC */ + +If CPU 0 supports Altivec, the code is left untouched. If it doesn't, both +instructions are replaced with nop's. + +The END_FTR_SECTION macro has two simpler variations: END_FTR_SECTION_IFSET +and END_FTR_SECTION_IFCLR. These simply test if a flag is set (in +cur_cpu_spec[0]->cpu_features) or is cleared, respectively. These two macros +should be used in the majority of cases. + +The END_FTR_SECTION macros are implemented by storing information about this +code in the '__ftr_fixup' ELF section. When do_cpu_ftr_fixups +(arch/powerpc/kernel/misc.S) is invoked, it will iterate over the records in +__ftr_fixup, and if the required feature is not present it will loop writing +nop's from each BEGIN_FTR_SECTION to END_FTR_SECTION. diff --git a/Documentation/powerpc/cpu_features.txt b/Documentation/powerpc/cpu_features.txt deleted file mode 100644 index ae09df8722c8..000000000000 --- a/Documentation/powerpc/cpu_features.txt +++ /dev/null @@ -1,56 +0,0 @@ -Hollis Blanchard -5 Jun 2002 - -This document describes the system (including self-modifying code) used in the -PPC Linux kernel to support a variety of PowerPC CPUs without requiring -compile-time selection. - -Early in the boot process the ppc32 kernel detects the current CPU type and -chooses a set of features accordingly. Some examples include Altivec support, -split instruction and data caches, and if the CPU supports the DOZE and NAP -sleep modes. - -Detection of the feature set is simple. A list of processors can be found in -arch/powerpc/kernel/cputable.c. The PVR register is masked and compared with -each value in the list. If a match is found, the cpu_features of cur_cpu_spec -is assigned to the feature bitmask for this processor and a __setup_cpu -function is called. - -C code may test 'cur_cpu_spec[smp_processor_id()]->cpu_features' for a -particular feature bit. This is done in quite a few places, for example -in ppc_setup_l2cr(). - -Implementing cpufeatures in assembly is a little more involved. There are -several paths that are performance-critical and would suffer if an array -index, structure dereference, and conditional branch were added. To avoid the -performance penalty but still allow for runtime (rather than compile-time) CPU -selection, unused code is replaced by 'nop' instructions. This nop'ing is -based on CPU 0's capabilities, so a multi-processor system with non-identical -processors will not work (but such a system would likely have other problems -anyways). - -After detecting the processor type, the kernel patches out sections of code -that shouldn't be used by writing nop's over it. Using cpufeatures requires -just 2 macros (found in arch/powerpc/include/asm/cputable.h), as seen in head.S -transfer_to_handler: - - #ifdef CONFIG_ALTIVEC - BEGIN_FTR_SECTION - mfspr r22,SPRN_VRSAVE /* if G4, save vrsave register value */ - stw r22,THREAD_VRSAVE(r23) - END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - #endif /* CONFIG_ALTIVEC */ - -If CPU 0 supports Altivec, the code is left untouched. If it doesn't, both -instructions are replaced with nop's. - -The END_FTR_SECTION macro has two simpler variations: END_FTR_SECTION_IFSET -and END_FTR_SECTION_IFCLR. These simply test if a flag is set (in -cur_cpu_spec[0]->cpu_features) or is cleared, respectively. These two macros -should be used in the majority of cases. - -The END_FTR_SECTION macros are implemented by storing information about this -code in the '__ftr_fixup' ELF section. When do_cpu_ftr_fixups -(arch/powerpc/kernel/misc.S) is invoked, it will iterate over the records in -__ftr_fixup, and if the required feature is not present it will loop writing -nop's from each BEGIN_FTR_SECTION to END_FTR_SECTION. diff --git a/Documentation/powerpc/cxl.rst b/Documentation/powerpc/cxl.rst new file mode 100644 index 000000000000..920546d81326 --- /dev/null +++ b/Documentation/powerpc/cxl.rst @@ -0,0 +1,467 @@ +==================================== +Coherent Accelerator Interface (CXL) +==================================== + +Introduction +============ + + The coherent accelerator interface is designed to allow the + coherent connection of accelerators (FPGAs and other devices) to a + POWER system. These devices need to adhere to the Coherent + Accelerator Interface Architecture (CAIA). + + IBM refers to this as the Coherent Accelerator Processor Interface + or CAPI. In the kernel it's referred to by the name CXL to avoid + confusion with the ISDN CAPI subsystem. + + Coherent in this context means that the accelerator and CPUs can + both access system memory directly and with the same effective + addresses. + + +Hardware overview +================= + + :: + + POWER8/9 FPGA + +----------+ +---------+ + | | | | + | CPU | | AFU | + | | | | + | | | | + | | | | + +----------+ +---------+ + | PHB | | | + | +------+ | PSL | + | | CAPP |<------>| | + +---+------+ PCIE +---------+ + + The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP) + unit which is part of the PCIe Host Bridge (PHB). This is managed + by Linux by calls into OPAL. Linux doesn't directly program the + CAPP. + + The FPGA (or coherently attached device) consists of two parts. + The POWER Service Layer (PSL) and the Accelerator Function Unit + (AFU). The AFU is used to implement specific functionality behind + the PSL. The PSL, among other things, provides memory address + translation services to allow each AFU direct access to userspace + memory. + + The AFU is the core part of the accelerator (eg. the compression, + crypto etc function). The kernel has no knowledge of the function + of the AFU. Only userspace interacts directly with the AFU. + + The PSL provides the translation and interrupt services that the + AFU needs. This is what the kernel interacts with. For example, if + the AFU needs to read a particular effective address, it sends + that address to the PSL, the PSL then translates it, fetches the + data from memory and returns it to the AFU. If the PSL has a + translation miss, it interrupts the kernel and the kernel services + the fault. The context to which this fault is serviced is based on + who owns that acceleration function. + + - POWER8 and PSL Version 8 are compliant to the CAIA Version 1.0. + - POWER9 and PSL Version 9 are compliant to the CAIA Version 2.0. + + This PSL Version 9 provides new features such as: + + * Interaction with the nest MMU on the P9 chip. + * Native DMA support. + * Supports sending ASB_Notify messages for host thread wakeup. + * Supports Atomic operations. + * etc. + + Cards with a PSL9 won't work on a POWER8 system and cards with a + PSL8 won't work on a POWER9 system. + +AFU Modes +========= + + There are two programming modes supported by the AFU. Dedicated + and AFU directed. AFU may support one or both modes. + + When using dedicated mode only one MMU context is supported. In + this mode, only one userspace process can use the accelerator at + time. + + When using AFU directed mode, up to 16K simultaneous contexts can + be supported. This means up to 16K simultaneous userspace + applications may use the accelerator (although specific AFUs may + support fewer). In this mode, the AFU sends a 16 bit context ID + with each of its requests. This tells the PSL which context is + associated with each operation. If the PSL can't translate an + operation, the ID can also be accessed by the kernel so it can + determine the userspace context associated with an operation. + + +MMIO space +========== + + A portion of the accelerator MMIO space can be directly mapped + from the AFU to userspace. Either the whole space can be mapped or + just a per context portion. The hardware is self describing, hence + the kernel can determine the offset and size of the per context + portion. + + +Interrupts +========== + + AFUs may generate interrupts that are destined for userspace. These + are received by the kernel as hardware interrupts and passed onto + userspace by a read syscall documented below. + + Data storage faults and error interrupts are handled by the kernel + driver. + + +Work Element Descriptor (WED) +============================= + + The WED is a 64-bit parameter passed to the AFU when a context is + started. Its format is up to the AFU hence the kernel has no + knowledge of what it represents. Typically it will be the + effective address of a work queue or status block where the AFU + and userspace can share control and status information. + + + + +User API +======== + +1. AFU character devices + + For AFUs operating in AFU directed mode, two character device + files will be created. /dev/cxl/afu0.0m will correspond to a + master context and /dev/cxl/afu0.0s will correspond to a slave + context. Master contexts have access to the full MMIO space an + AFU provides. Slave contexts have access to only the per process + MMIO space an AFU provides. + + For AFUs operating in dedicated process mode, the driver will + only create a single character device per AFU called + /dev/cxl/afu0.0d. This will have access to the entire MMIO space + that the AFU provides (like master contexts in AFU directed). + + The types described below are defined in include/uapi/misc/cxl.h + + The following file operations are supported on both slave and + master devices. + + A userspace library libcxl is available here: + + https://github.com/ibm-capi/libcxl + + This provides a C interface to this kernel API. + +open +---- + + Opens the device and allocates a file descriptor to be used with + the rest of the API. + + A dedicated mode AFU only has one context and only allows the + device to be opened once. + + An AFU directed mode AFU can have many contexts, the device can be + opened once for each context that is available. + + When all available contexts are allocated the open call will fail + and return -ENOSPC. + + Note: + IRQs need to be allocated for each context, which may limit + the number of contexts that can be created, and therefore + how many times the device can be opened. The POWER8 CAPP + supports 2040 IRQs and 3 are used by the kernel, so 2037 are + left. If 1 IRQ is needed per context, then only 2037 + contexts can be allocated. If 4 IRQs are needed per context, + then only 2037/4 = 509 contexts can be allocated. + + +ioctl +----- + + CXL_IOCTL_START_WORK: + Starts the AFU context and associates it with the current + process. Once this ioctl is successfully executed, all memory + mapped into this process is accessible to this AFU context + using the same effective addresses. No additional calls are + required to map/unmap memory. The AFU memory context will be + updated as userspace allocates and frees memory. This ioctl + returns once the AFU context is started. + + Takes a pointer to a struct cxl_ioctl_start_work + + :: + + struct cxl_ioctl_start_work { + __u64 flags; + __u64 work_element_descriptor; + __u64 amr; + __s16 num_interrupts; + __s16 reserved1; + __s32 reserved2; + __u64 reserved3; + __u64 reserved4; + __u64 reserved5; + __u64 reserved6; + }; + + flags: + Indicates which optional fields in the structure are + valid. + + work_element_descriptor: + The Work Element Descriptor (WED) is a 64-bit argument + defined by the AFU. Typically this is an effective + address pointing to an AFU specific structure + describing what work to perform. + + amr: + Authority Mask Register (AMR), same as the powerpc + AMR. This field is only used by the kernel when the + corresponding CXL_START_WORK_AMR value is specified in + flags. If not specified the kernel will use a default + value of 0. + + num_interrupts: + Number of userspace interrupts to request. This field + is only used by the kernel when the corresponding + CXL_START_WORK_NUM_IRQS value is specified in flags. + If not specified the minimum number required by the + AFU will be allocated. The min and max number can be + obtained from sysfs. + + reserved fields: + For ABI padding and future extensions + + CXL_IOCTL_GET_PROCESS_ELEMENT: + Get the current context id, also known as the process element. + The value is returned from the kernel as a __u32. + + +mmap +---- + + An AFU may have an MMIO space to facilitate communication with the + AFU. If it does, the MMIO space can be accessed via mmap. The size + and contents of this area are specific to the particular AFU. The + size can be discovered via sysfs. + + In AFU directed mode, master contexts are allowed to map all of + the MMIO space and slave contexts are allowed to only map the per + process MMIO space associated with the context. In dedicated + process mode the entire MMIO space can always be mapped. + + This mmap call must be done after the START_WORK ioctl. + + Care should be taken when accessing MMIO space. Only 32 and 64-bit + accesses are supported by POWER8. Also, the AFU will be designed + with a specific endianness, so all MMIO accesses should consider + endianness (recommend endian(3) variants like: le64toh(), + be64toh() etc). These endian issues equally apply to shared memory + queues the WED may describe. + + +read +---- + + Reads events from the AFU. Blocks if no events are pending + (unless O_NONBLOCK is supplied). Returns -EIO in the case of an + unrecoverable error or if the card is removed. + + read() will always return an integral number of events. + + The buffer passed to read() must be at least 4K bytes. + + The result of the read will be a buffer of one or more events, + each event is of type struct cxl_event, of varying size:: + + struct cxl_event { + struct cxl_event_header header; + union { + struct cxl_event_afu_interrupt irq; + struct cxl_event_data_storage fault; + struct cxl_event_afu_error afu_error; + }; + }; + + The struct cxl_event_header is defined as + + :: + + struct cxl_event_header { + __u16 type; + __u16 size; + __u16 process_element; + __u16 reserved1; + }; + + type: + This defines the type of event. The type determines how + the rest of the event is structured. These types are + described below and defined by enum cxl_event_type. + + size: + This is the size of the event in bytes including the + struct cxl_event_header. The start of the next event can + be found at this offset from the start of the current + event. + + process_element: + Context ID of the event. + + reserved field: + For future extensions and padding. + + If the event type is CXL_EVENT_AFU_INTERRUPT then the event + structure is defined as + + :: + + struct cxl_event_afu_interrupt { + __u16 flags; + __u16 irq; /* Raised AFU interrupt number */ + __u32 reserved1; + }; + + flags: + These flags indicate which optional fields are present + in this struct. Currently all fields are mandatory. + + irq: + The IRQ number sent by the AFU. + + reserved field: + For future extensions and padding. + + If the event type is CXL_EVENT_DATA_STORAGE then the event + structure is defined as + + :: + + struct cxl_event_data_storage { + __u16 flags; + __u16 reserved1; + __u32 reserved2; + __u64 addr; + __u64 dsisr; + __u64 reserved3; + }; + + flags: + These flags indicate which optional fields are present in + this struct. Currently all fields are mandatory. + + address: + The address that the AFU unsuccessfully attempted to + access. Valid accesses will be handled transparently by the + kernel but invalid accesses will generate this event. + + dsisr: + This field gives information on the type of fault. It is a + copy of the DSISR from the PSL hardware when the address + fault occurred. The form of the DSISR is as defined in the + CAIA. + + reserved fields: + For future extensions + + If the event type is CXL_EVENT_AFU_ERROR then the event structure + is defined as + + :: + + struct cxl_event_afu_error { + __u16 flags; + __u16 reserved1; + __u32 reserved2; + __u64 error; + }; + + flags: + These flags indicate which optional fields are present in + this struct. Currently all fields are Mandatory. + + error: + Error status from the AFU. Defined by the AFU. + + reserved fields: + For future extensions and padding + + +2. Card character device (powerVM guest only) + + In a powerVM guest, an extra character device is created for the + card. The device is only used to write (flash) a new image on the + FPGA accelerator. Once the image is written and verified, the + device tree is updated and the card is reset to reload the updated + image. + +open +---- + + Opens the device and allocates a file descriptor to be used with + the rest of the API. The device can only be opened once. + +ioctl +----- + +CXL_IOCTL_DOWNLOAD_IMAGE / CXL_IOCTL_VALIDATE_IMAGE: + Starts and controls flashing a new FPGA image. Partial + reconfiguration is not supported (yet), so the image must contain + a copy of the PSL and AFU(s). Since an image can be quite large, + the caller may have to iterate, splitting the image in smaller + chunks. + + Takes a pointer to a struct cxl_adapter_image:: + + struct cxl_adapter_image { + __u64 flags; + __u64 data; + __u64 len_data; + __u64 len_image; + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + __u64 reserved4; + }; + + flags: + These flags indicate which optional fields are present in + this struct. Currently all fields are mandatory. + + data: + Pointer to a buffer with part of the image to write to the + card. + + len_data: + Size of the buffer pointed to by data. + + len_image: + Full size of the image. + + +Sysfs Class +=========== + + A cxl sysfs class is added under /sys/class/cxl to facilitate + enumeration and tuning of the accelerators. Its layout is + described in Documentation/ABI/testing/sysfs-class-cxl + + +Udev rules +========== + + The following udev rules could be used to create a symlink to the + most logical chardev to use in any programming mode (afuX.Yd for + dedicated, afuX.Ys for afu directed), since the API is virtually + identical for each:: + + SUBSYSTEM=="cxl", ATTRS{mode}=="dedicated_process", SYMLINK="cxl/%b" + SUBSYSTEM=="cxl", ATTRS{mode}=="afu_directed", \ + KERNEL=="afu[0-9]*.[0-9]*s", SYMLINK="cxl/%b" diff --git a/Documentation/powerpc/cxl.txt b/Documentation/powerpc/cxl.txt deleted file mode 100644 index c5e8d5098ed3..000000000000 --- a/Documentation/powerpc/cxl.txt +++ /dev/null @@ -1,449 +0,0 @@ -Coherent Accelerator Interface (CXL) -==================================== - -Introduction -============ - - The coherent accelerator interface is designed to allow the - coherent connection of accelerators (FPGAs and other devices) to a - POWER system. These devices need to adhere to the Coherent - Accelerator Interface Architecture (CAIA). - - IBM refers to this as the Coherent Accelerator Processor Interface - or CAPI. In the kernel it's referred to by the name CXL to avoid - confusion with the ISDN CAPI subsystem. - - Coherent in this context means that the accelerator and CPUs can - both access system memory directly and with the same effective - addresses. - - -Hardware overview -================= - - POWER8/9 FPGA - +----------+ +---------+ - | | | | - | CPU | | AFU | - | | | | - | | | | - | | | | - +----------+ +---------+ - | PHB | | | - | +------+ | PSL | - | | CAPP |<------>| | - +---+------+ PCIE +---------+ - - The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP) - unit which is part of the PCIe Host Bridge (PHB). This is managed - by Linux by calls into OPAL. Linux doesn't directly program the - CAPP. - - The FPGA (or coherently attached device) consists of two parts. - The POWER Service Layer (PSL) and the Accelerator Function Unit - (AFU). The AFU is used to implement specific functionality behind - the PSL. The PSL, among other things, provides memory address - translation services to allow each AFU direct access to userspace - memory. - - The AFU is the core part of the accelerator (eg. the compression, - crypto etc function). The kernel has no knowledge of the function - of the AFU. Only userspace interacts directly with the AFU. - - The PSL provides the translation and interrupt services that the - AFU needs. This is what the kernel interacts with. For example, if - the AFU needs to read a particular effective address, it sends - that address to the PSL, the PSL then translates it, fetches the - data from memory and returns it to the AFU. If the PSL has a - translation miss, it interrupts the kernel and the kernel services - the fault. The context to which this fault is serviced is based on - who owns that acceleration function. - - POWER8 <-----> PSL Version 8 is compliant to the CAIA Version 1.0. - POWER9 <-----> PSL Version 9 is compliant to the CAIA Version 2.0. - This PSL Version 9 provides new features such as: - * Interaction with the nest MMU on the P9 chip. - * Native DMA support. - * Supports sending ASB_Notify messages for host thread wakeup. - * Supports Atomic operations. - * .... - - Cards with a PSL9 won't work on a POWER8 system and cards with a - PSL8 won't work on a POWER9 system. - -AFU Modes -========= - - There are two programming modes supported by the AFU. Dedicated - and AFU directed. AFU may support one or both modes. - - When using dedicated mode only one MMU context is supported. In - this mode, only one userspace process can use the accelerator at - time. - - When using AFU directed mode, up to 16K simultaneous contexts can - be supported. This means up to 16K simultaneous userspace - applications may use the accelerator (although specific AFUs may - support fewer). In this mode, the AFU sends a 16 bit context ID - with each of its requests. This tells the PSL which context is - associated with each operation. If the PSL can't translate an - operation, the ID can also be accessed by the kernel so it can - determine the userspace context associated with an operation. - - -MMIO space -========== - - A portion of the accelerator MMIO space can be directly mapped - from the AFU to userspace. Either the whole space can be mapped or - just a per context portion. The hardware is self describing, hence - the kernel can determine the offset and size of the per context - portion. - - -Interrupts -========== - - AFUs may generate interrupts that are destined for userspace. These - are received by the kernel as hardware interrupts and passed onto - userspace by a read syscall documented below. - - Data storage faults and error interrupts are handled by the kernel - driver. - - -Work Element Descriptor (WED) -============================= - - The WED is a 64-bit parameter passed to the AFU when a context is - started. Its format is up to the AFU hence the kernel has no - knowledge of what it represents. Typically it will be the - effective address of a work queue or status block where the AFU - and userspace can share control and status information. - - - - -User API -======== - -1. AFU character devices - - For AFUs operating in AFU directed mode, two character device - files will be created. /dev/cxl/afu0.0m will correspond to a - master context and /dev/cxl/afu0.0s will correspond to a slave - context. Master contexts have access to the full MMIO space an - AFU provides. Slave contexts have access to only the per process - MMIO space an AFU provides. - - For AFUs operating in dedicated process mode, the driver will - only create a single character device per AFU called - /dev/cxl/afu0.0d. This will have access to the entire MMIO space - that the AFU provides (like master contexts in AFU directed). - - The types described below are defined in include/uapi/misc/cxl.h - - The following file operations are supported on both slave and - master devices. - - A userspace library libcxl is available here: - https://github.com/ibm-capi/libcxl - This provides a C interface to this kernel API. - -open ----- - - Opens the device and allocates a file descriptor to be used with - the rest of the API. - - A dedicated mode AFU only has one context and only allows the - device to be opened once. - - An AFU directed mode AFU can have many contexts, the device can be - opened once for each context that is available. - - When all available contexts are allocated the open call will fail - and return -ENOSPC. - - Note: IRQs need to be allocated for each context, which may limit - the number of contexts that can be created, and therefore - how many times the device can be opened. The POWER8 CAPP - supports 2040 IRQs and 3 are used by the kernel, so 2037 are - left. If 1 IRQ is needed per context, then only 2037 - contexts can be allocated. If 4 IRQs are needed per context, - then only 2037/4 = 509 contexts can be allocated. - - -ioctl ------ - - CXL_IOCTL_START_WORK: - Starts the AFU context and associates it with the current - process. Once this ioctl is successfully executed, all memory - mapped into this process is accessible to this AFU context - using the same effective addresses. No additional calls are - required to map/unmap memory. The AFU memory context will be - updated as userspace allocates and frees memory. This ioctl - returns once the AFU context is started. - - Takes a pointer to a struct cxl_ioctl_start_work: - - struct cxl_ioctl_start_work { - __u64 flags; - __u64 work_element_descriptor; - __u64 amr; - __s16 num_interrupts; - __s16 reserved1; - __s32 reserved2; - __u64 reserved3; - __u64 reserved4; - __u64 reserved5; - __u64 reserved6; - }; - - flags: - Indicates which optional fields in the structure are - valid. - - work_element_descriptor: - The Work Element Descriptor (WED) is a 64-bit argument - defined by the AFU. Typically this is an effective - address pointing to an AFU specific structure - describing what work to perform. - - amr: - Authority Mask Register (AMR), same as the powerpc - AMR. This field is only used by the kernel when the - corresponding CXL_START_WORK_AMR value is specified in - flags. If not specified the kernel will use a default - value of 0. - - num_interrupts: - Number of userspace interrupts to request. This field - is only used by the kernel when the corresponding - CXL_START_WORK_NUM_IRQS value is specified in flags. - If not specified the minimum number required by the - AFU will be allocated. The min and max number can be - obtained from sysfs. - - reserved fields: - For ABI padding and future extensions - - CXL_IOCTL_GET_PROCESS_ELEMENT: - Get the current context id, also known as the process element. - The value is returned from the kernel as a __u32. - - -mmap ----- - - An AFU may have an MMIO space to facilitate communication with the - AFU. If it does, the MMIO space can be accessed via mmap. The size - and contents of this area are specific to the particular AFU. The - size can be discovered via sysfs. - - In AFU directed mode, master contexts are allowed to map all of - the MMIO space and slave contexts are allowed to only map the per - process MMIO space associated with the context. In dedicated - process mode the entire MMIO space can always be mapped. - - This mmap call must be done after the START_WORK ioctl. - - Care should be taken when accessing MMIO space. Only 32 and 64-bit - accesses are supported by POWER8. Also, the AFU will be designed - with a specific endianness, so all MMIO accesses should consider - endianness (recommend endian(3) variants like: le64toh(), - be64toh() etc). These endian issues equally apply to shared memory - queues the WED may describe. - - -read ----- - - Reads events from the AFU. Blocks if no events are pending - (unless O_NONBLOCK is supplied). Returns -EIO in the case of an - unrecoverable error or if the card is removed. - - read() will always return an integral number of events. - - The buffer passed to read() must be at least 4K bytes. - - The result of the read will be a buffer of one or more events, - each event is of type struct cxl_event, of varying size. - - struct cxl_event { - struct cxl_event_header header; - union { - struct cxl_event_afu_interrupt irq; - struct cxl_event_data_storage fault; - struct cxl_event_afu_error afu_error; - }; - }; - - The struct cxl_event_header is defined as: - - struct cxl_event_header { - __u16 type; - __u16 size; - __u16 process_element; - __u16 reserved1; - }; - - type: - This defines the type of event. The type determines how - the rest of the event is structured. These types are - described below and defined by enum cxl_event_type. - - size: - This is the size of the event in bytes including the - struct cxl_event_header. The start of the next event can - be found at this offset from the start of the current - event. - - process_element: - Context ID of the event. - - reserved field: - For future extensions and padding. - - If the event type is CXL_EVENT_AFU_INTERRUPT then the event - structure is defined as: - - struct cxl_event_afu_interrupt { - __u16 flags; - __u16 irq; /* Raised AFU interrupt number */ - __u32 reserved1; - }; - - flags: - These flags indicate which optional fields are present - in this struct. Currently all fields are mandatory. - - irq: - The IRQ number sent by the AFU. - - reserved field: - For future extensions and padding. - - If the event type is CXL_EVENT_DATA_STORAGE then the event - structure is defined as: - - struct cxl_event_data_storage { - __u16 flags; - __u16 reserved1; - __u32 reserved2; - __u64 addr; - __u64 dsisr; - __u64 reserved3; - }; - - flags: - These flags indicate which optional fields are present in - this struct. Currently all fields are mandatory. - - address: - The address that the AFU unsuccessfully attempted to - access. Valid accesses will be handled transparently by the - kernel but invalid accesses will generate this event. - - dsisr: - This field gives information on the type of fault. It is a - copy of the DSISR from the PSL hardware when the address - fault occurred. The form of the DSISR is as defined in the - CAIA. - - reserved fields: - For future extensions - - If the event type is CXL_EVENT_AFU_ERROR then the event structure - is defined as: - - struct cxl_event_afu_error { - __u16 flags; - __u16 reserved1; - __u32 reserved2; - __u64 error; - }; - - flags: - These flags indicate which optional fields are present in - this struct. Currently all fields are Mandatory. - - error: - Error status from the AFU. Defined by the AFU. - - reserved fields: - For future extensions and padding - - -2. Card character device (powerVM guest only) - - In a powerVM guest, an extra character device is created for the - card. The device is only used to write (flash) a new image on the - FPGA accelerator. Once the image is written and verified, the - device tree is updated and the card is reset to reload the updated - image. - -open ----- - - Opens the device and allocates a file descriptor to be used with - the rest of the API. The device can only be opened once. - -ioctl ------ - -CXL_IOCTL_DOWNLOAD_IMAGE: -CXL_IOCTL_VALIDATE_IMAGE: - Starts and controls flashing a new FPGA image. Partial - reconfiguration is not supported (yet), so the image must contain - a copy of the PSL and AFU(s). Since an image can be quite large, - the caller may have to iterate, splitting the image in smaller - chunks. - - Takes a pointer to a struct cxl_adapter_image: - struct cxl_adapter_image { - __u64 flags; - __u64 data; - __u64 len_data; - __u64 len_image; - __u64 reserved1; - __u64 reserved2; - __u64 reserved3; - __u64 reserved4; - }; - - flags: - These flags indicate which optional fields are present in - this struct. Currently all fields are mandatory. - - data: - Pointer to a buffer with part of the image to write to the - card. - - len_data: - Size of the buffer pointed to by data. - - len_image: - Full size of the image. - - -Sysfs Class -=========== - - A cxl sysfs class is added under /sys/class/cxl to facilitate - enumeration and tuning of the accelerators. Its layout is - described in Documentation/ABI/testing/sysfs-class-cxl - - -Udev rules -========== - - The following udev rules could be used to create a symlink to the - most logical chardev to use in any programming mode (afuX.Yd for - dedicated, afuX.Ys for afu directed), since the API is virtually - identical for each: - - SUBSYSTEM=="cxl", ATTRS{mode}=="dedicated_process", SYMLINK="cxl/%b" - SUBSYSTEM=="cxl", ATTRS{mode}=="afu_directed", \ - KERNEL=="afu[0-9]*.[0-9]*s", SYMLINK="cxl/%b" diff --git a/Documentation/powerpc/cxlflash.rst b/Documentation/powerpc/cxlflash.rst new file mode 100644 index 000000000000..cea67931b3b9 --- /dev/null +++ b/Documentation/powerpc/cxlflash.rst @@ -0,0 +1,433 @@ +================================ +Coherent Accelerator (CXL) Flash +================================ + +Introduction +============ + + The IBM Power architecture provides support for CAPI (Coherent + Accelerator Power Interface), which is available to certain PCIe slots + on Power 8 systems. CAPI can be thought of as a special tunneling + protocol through PCIe that allow PCIe adapters to look like special + purpose co-processors which can read or write an application's + memory and generate page faults. As a result, the host interface to + an adapter running in CAPI mode does not require the data buffers to + be mapped to the device's memory (IOMMU bypass) nor does it require + memory to be pinned. + + On Linux, Coherent Accelerator (CXL) kernel services present CAPI + devices as a PCI device by implementing a virtual PCI host bridge. + This abstraction simplifies the infrastructure and programming + model, allowing for drivers to look similar to other native PCI + device drivers. + + CXL provides a mechanism by which user space applications can + directly talk to a device (network or storage) bypassing the typical + kernel/device driver stack. The CXL Flash Adapter Driver enables a + user space application direct access to Flash storage. + + The CXL Flash Adapter Driver is a kernel module that sits in the + SCSI stack as a low level device driver (below the SCSI disk and + protocol drivers) for the IBM CXL Flash Adapter. This driver is + responsible for the initialization of the adapter, setting up the + special path for user space access, and performing error recovery. It + communicates directly the Flash Accelerator Functional Unit (AFU) + as described in Documentation/powerpc/cxl.rst. + + The cxlflash driver supports two, mutually exclusive, modes of + operation at the device (LUN) level: + + - Any flash device (LUN) can be configured to be accessed as a + regular disk device (i.e.: /dev/sdc). This is the default mode. + + - Any flash device (LUN) can be configured to be accessed from + user space with a special block library. This mode further + specifies the means of accessing the device and provides for + either raw access to the entire LUN (referred to as direct + or physical LUN access) or access to a kernel/AFU-mediated + partition of the LUN (referred to as virtual LUN access). The + segmentation of a disk device into virtual LUNs is assisted + by special translation services provided by the Flash AFU. + +Overview +======== + + The Coherent Accelerator Interface Architecture (CAIA) introduces a + concept of a master context. A master typically has special privileges + granted to it by the kernel or hypervisor allowing it to perform AFU + wide management and control. The master may or may not be involved + directly in each user I/O, but at the minimum is involved in the + initial setup before the user application is allowed to send requests + directly to the AFU. + + The CXL Flash Adapter Driver establishes a master context with the + AFU. It uses memory mapped I/O (MMIO) for this control and setup. The + Adapter Problem Space Memory Map looks like this:: + + +-------------------------------+ + | 512 * 64 KB User MMIO | + | (per context) | + | User Accessible | + +-------------------------------+ + | 512 * 128 B per context | + | Provisioning and Control | + | Trusted Process accessible | + +-------------------------------+ + | 64 KB Global | + | Trusted Process accessible | + +-------------------------------+ + + This driver configures itself into the SCSI software stack as an + adapter driver. The driver is the only entity that is considered a + Trusted Process to program the Provisioning and Control and Global + areas in the MMIO Space shown above. The master context driver + discovers all LUNs attached to the CXL Flash adapter and instantiates + scsi block devices (/dev/sdb, /dev/sdc etc.) for each unique LUN + seen from each path. + + Once these scsi block devices are instantiated, an application + written to a specification provided by the block library may get + access to the Flash from user space (without requiring a system call). + + This master context driver also provides a series of ioctls for this + block library to enable this user space access. The driver supports + two modes for accessing the block device. + + The first mode is called a virtual mode. In this mode a single scsi + block device (/dev/sdb) may be carved up into any number of distinct + virtual LUNs. The virtual LUNs may be resized as long as the sum of + the sizes of all the virtual LUNs, along with the meta-data associated + with it does not exceed the physical capacity. + + The second mode is called the physical mode. In this mode a single + block device (/dev/sdb) may be opened directly by the block library + and the entire space for the LUN is available to the application. + + Only the physical mode provides persistence of the data. i.e. The + data written to the block device will survive application exit and + restart and also reboot. The virtual LUNs do not persist (i.e. do + not survive after the application terminates or the system reboots). + + +Block library API +================= + + Applications intending to get access to the CXL Flash from user + space should use the block library, as it abstracts the details of + interfacing directly with the cxlflash driver that are necessary for + performing administrative actions (i.e.: setup, tear down, resize). + The block library can be thought of as a 'user' of services, + implemented as IOCTLs, that are provided by the cxlflash driver + specifically for devices (LUNs) operating in user space access + mode. While it is not a requirement that applications understand + the interface between the block library and the cxlflash driver, + a high-level overview of each supported service (IOCTL) is provided + below. + + The block library can be found on GitHub: + http://github.com/open-power/capiflash + + +CXL Flash Driver LUN IOCTLs +=========================== + + Users, such as the block library, that wish to interface with a flash + device (LUN) via user space access need to use the services provided + by the cxlflash driver. As these services are implemented as ioctls, + a file descriptor handle must first be obtained in order to establish + the communication channel between a user and the kernel. This file + descriptor is obtained by opening the device special file associated + with the scsi disk device (/dev/sdb) that was created during LUN + discovery. As per the location of the cxlflash driver within the + SCSI protocol stack, this open is actually not seen by the cxlflash + driver. Upon successful open, the user receives a file descriptor + (herein referred to as fd1) that should be used for issuing the + subsequent ioctls listed below. + + The structure definitions for these IOCTLs are available in: + uapi/scsi/cxlflash_ioctl.h + +DK_CXLFLASH_ATTACH +------------------ + + This ioctl obtains, initializes, and starts a context using the CXL + kernel services. These services specify a context id (u16) by which + to uniquely identify the context and its allocated resources. The + services additionally provide a second file descriptor (herein + referred to as fd2) that is used by the block library to initiate + memory mapped I/O (via mmap()) to the CXL flash device and poll for + completion events. This file descriptor is intentionally installed by + this driver and not the CXL kernel services to allow for intermediary + notification and access in the event of a non-user-initiated close(), + such as a killed process. This design point is described in further + detail in the description for the DK_CXLFLASH_DETACH ioctl. + + There are a few important aspects regarding the "tokens" (context id + and fd2) that are provided back to the user: + + - These tokens are only valid for the process under which they + were created. The child of a forked process cannot continue + to use the context id or file descriptor created by its parent + (see DK_CXLFLASH_VLUN_CLONE for further details). + + - These tokens are only valid for the lifetime of the context and + the process under which they were created. Once either is + destroyed, the tokens are to be considered stale and subsequent + usage will result in errors. + + - A valid adapter file descriptor (fd2 >= 0) is only returned on + the initial attach for a context. Subsequent attaches to an + existing context (DK_CXLFLASH_ATTACH_REUSE_CONTEXT flag present) + do not provide the adapter file descriptor as it was previously + made known to the application. + + - When a context is no longer needed, the user shall detach from + the context via the DK_CXLFLASH_DETACH ioctl. When this ioctl + returns with a valid adapter file descriptor and the return flag + DK_CXLFLASH_APP_CLOSE_ADAP_FD is present, the application _must_ + close the adapter file descriptor following a successful detach. + + - When this ioctl returns with a valid fd2 and the return flag + DK_CXLFLASH_APP_CLOSE_ADAP_FD is present, the application _must_ + close fd2 in the following circumstances: + + + Following a successful detach of the last user of the context + + Following a successful recovery on the context's original fd2 + + In the child process of a fork(), following a clone ioctl, + on the fd2 associated with the source context + + - At any time, a close on fd2 will invalidate the tokens. Applications + should exercise caution to only close fd2 when appropriate (outlined + in the previous bullet) to avoid premature loss of I/O. + +DK_CXLFLASH_USER_DIRECT +----------------------- + This ioctl is responsible for transitioning the LUN to direct + (physical) mode access and configuring the AFU for direct access from + user space on a per-context basis. Additionally, the block size and + last logical block address (LBA) are returned to the user. + + As mentioned previously, when operating in user space access mode, + LUNs may be accessed in whole or in part. Only one mode is allowed + at a time and if one mode is active (outstanding references exist), + requests to use the LUN in a different mode are denied. + + The AFU is configured for direct access from user space by adding an + entry to the AFU's resource handle table. The index of the entry is + treated as a resource handle that is returned to the user. The user + is then able to use the handle to reference the LUN during I/O. + +DK_CXLFLASH_USER_VIRTUAL +------------------------ + This ioctl is responsible for transitioning the LUN to virtual mode + of access and configuring the AFU for virtual access from user space + on a per-context basis. Additionally, the block size and last logical + block address (LBA) are returned to the user. + + As mentioned previously, when operating in user space access mode, + LUNs may be accessed in whole or in part. Only one mode is allowed + at a time and if one mode is active (outstanding references exist), + requests to use the LUN in a different mode are denied. + + The AFU is configured for virtual access from user space by adding + an entry to the AFU's resource handle table. The index of the entry + is treated as a resource handle that is returned to the user. The + user is then able to use the handle to reference the LUN during I/O. + + By default, the virtual LUN is created with a size of 0. The user + would need to use the DK_CXLFLASH_VLUN_RESIZE ioctl to adjust the grow + the virtual LUN to a desired size. To avoid having to perform this + resize for the initial creation of the virtual LUN, the user has the + option of specifying a size as part of the DK_CXLFLASH_USER_VIRTUAL + ioctl, such that when success is returned to the user, the + resource handle that is provided is already referencing provisioned + storage. This is reflected by the last LBA being a non-zero value. + + When a LUN is accessible from more than one port, this ioctl will + return with the DK_CXLFLASH_ALL_PORTS_ACTIVE return flag set. This + provides the user with a hint that I/O can be retried in the event + of an I/O error as the LUN can be reached over multiple paths. + +DK_CXLFLASH_VLUN_RESIZE +----------------------- + This ioctl is responsible for resizing a previously created virtual + LUN and will fail if invoked upon a LUN that is not in virtual + mode. Upon success, an updated last LBA is returned to the user + indicating the new size of the virtual LUN associated with the + resource handle. + + The partitioning of virtual LUNs is jointly mediated by the cxlflash + driver and the AFU. An allocation table is kept for each LUN that is + operating in the virtual mode and used to program a LUN translation + table that the AFU references when provided with a resource handle. + + This ioctl can return -EAGAIN if an AFU sync operation takes too long. + In addition to returning a failure to user, cxlflash will also schedule + an asynchronous AFU reset. Should the user choose to retry the operation, + it is expected to succeed. If this ioctl fails with -EAGAIN, the user + can either retry the operation or treat it as a failure. + +DK_CXLFLASH_RELEASE +------------------- + This ioctl is responsible for releasing a previously obtained + reference to either a physical or virtual LUN. This can be + thought of as the inverse of the DK_CXLFLASH_USER_DIRECT or + DK_CXLFLASH_USER_VIRTUAL ioctls. Upon success, the resource handle + is no longer valid and the entry in the resource handle table is + made available to be used again. + + As part of the release process for virtual LUNs, the virtual LUN + is first resized to 0 to clear out and free the translation tables + associated with the virtual LUN reference. + +DK_CXLFLASH_DETACH +------------------ + This ioctl is responsible for unregistering a context with the + cxlflash driver and release outstanding resources that were + not explicitly released via the DK_CXLFLASH_RELEASE ioctl. Upon + success, all "tokens" which had been provided to the user from the + DK_CXLFLASH_ATTACH onward are no longer valid. + + When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful + attach, the application _must_ close the fd2 associated with the context + following the detach of the final user of the context. + +DK_CXLFLASH_VLUN_CLONE +---------------------- + This ioctl is responsible for cloning a previously created + context to a more recently created context. It exists solely to + support maintaining user space access to storage after a process + forks. Upon success, the child process (which invoked the ioctl) + will have access to the same LUNs via the same resource handle(s) + as the parent, but under a different context. + + Context sharing across processes is not supported with CXL and + therefore each fork must be met with establishing a new context + for the child process. This ioctl simplifies the state management + and playback required by a user in such a scenario. When a process + forks, child process can clone the parents context by first creating + a context (via DK_CXLFLASH_ATTACH) and then using this ioctl to + perform the clone from the parent to the child. + + The clone itself is fairly simple. The resource handle and lun + translation tables are copied from the parent context to the child's + and then synced with the AFU. + + When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful + attach, the application _must_ close the fd2 associated with the source + context (still resident/accessible in the parent process) following the + clone. This is to avoid a stale entry in the file descriptor table of the + child process. + + This ioctl can return -EAGAIN if an AFU sync operation takes too long. + In addition to returning a failure to user, cxlflash will also schedule + an asynchronous AFU reset. Should the user choose to retry the operation, + it is expected to succeed. If this ioctl fails with -EAGAIN, the user + can either retry the operation or treat it as a failure. + +DK_CXLFLASH_VERIFY +------------------ + This ioctl is used to detect various changes such as the capacity of + the disk changing, the number of LUNs visible changing, etc. In cases + where the changes affect the application (such as a LUN resize), the + cxlflash driver will report the changed state to the application. + + The user calls in when they want to validate that a LUN hasn't been + changed in response to a check condition. As the user is operating out + of band from the kernel, they will see these types of events without + the kernel's knowledge. When encountered, the user's architected + behavior is to call in to this ioctl, indicating what they want to + verify and passing along any appropriate information. For now, only + verifying a LUN change (ie: size different) with sense data is + supported. + +DK_CXLFLASH_RECOVER_AFU +----------------------- + This ioctl is used to drive recovery (if such an action is warranted) + of a specified user context. Any state associated with the user context + is re-established upon successful recovery. + + User contexts are put into an error condition when the device needs to + be reset or is terminating. Users are notified of this error condition + by seeing all 0xF's on an MMIO read. Upon encountering this, the + architected behavior for a user is to call into this ioctl to recover + their context. A user may also call into this ioctl at any time to + check if the device is operating normally. If a failure is returned + from this ioctl, the user is expected to gracefully clean up their + context via release/detach ioctls. Until they do, the context they + hold is not relinquished. The user may also optionally exit the process + at which time the context/resources they held will be freed as part of + the release fop. + + When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful + attach, the application _must_ unmap and close the fd2 associated with the + original context following this ioctl returning success and indicating that + the context was recovered (DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET). + +DK_CXLFLASH_MANAGE_LUN +---------------------- + This ioctl is used to switch a LUN from a mode where it is available + for file-system access (legacy), to a mode where it is set aside for + exclusive user space access (superpipe). In case a LUN is visible + across multiple ports and adapters, this ioctl is used to uniquely + identify each LUN by its World Wide Node Name (WWNN). + + +CXL Flash Driver Host IOCTLs +============================ + + Each host adapter instance that is supported by the cxlflash driver + has a special character device associated with it to enable a set of + host management function. These character devices are hosted in a + class dedicated for cxlflash and can be accessed via `/dev/cxlflash/*`. + + Applications can be written to perform various functions using the + host ioctl APIs below. + + The structure definitions for these IOCTLs are available in: + uapi/scsi/cxlflash_ioctl.h + +HT_CXLFLASH_LUN_PROVISION +------------------------- + This ioctl is used to create and delete persistent LUNs on cxlflash + devices that lack an external LUN management interface. It is only + valid when used with AFUs that support the LUN provision capability. + + When sufficient space is available, LUNs can be created by specifying + the target port to host the LUN and a desired size in 4K blocks. Upon + success, the LUN ID and WWID of the created LUN will be returned and + the SCSI bus can be scanned to detect the change in LUN topology. Note + that partial allocations are not supported. Should a creation fail due + to a space issue, the target port can be queried for its current LUN + geometry. + + To remove a LUN, the device must first be disassociated from the Linux + SCSI subsystem. The LUN deletion can then be initiated by specifying a + target port and LUN ID. Upon success, the LUN geometry associated with + the port will be updated to reflect new number of provisioned LUNs and + available capacity. + + To query the LUN geometry of a port, the target port is specified and + upon success, the following information is presented: + + - Maximum number of provisioned LUNs allowed for the port + - Current number of provisioned LUNs for the port + - Maximum total capacity of provisioned LUNs for the port (4K blocks) + - Current total capacity of provisioned LUNs for the port (4K blocks) + + With this information, the number of available LUNs and capacity can be + can be calculated. + +HT_CXLFLASH_AFU_DEBUG +--------------------- + This ioctl is used to debug AFUs by supporting a command pass-through + interface. It is only valid when used with AFUs that support the AFU + debug capability. + + With exception of buffer management, AFU debug commands are opaque to + cxlflash and treated as pass-through. For debug commands that do require + data transfer, the user supplies an adequately sized data buffer and must + specify the data transfer direction with respect to the host. There is a + maximum transfer size of 256K imposed. Note that partial read completions + are not supported - when errors are experienced with a host read data + transfer, the data buffer is not copied back to the user. diff --git a/Documentation/powerpc/cxlflash.txt b/Documentation/powerpc/cxlflash.txt deleted file mode 100644 index a64bdaa0a1cf..000000000000 --- a/Documentation/powerpc/cxlflash.txt +++ /dev/null @@ -1,429 +0,0 @@ -Introduction -============ - - The IBM Power architecture provides support for CAPI (Coherent - Accelerator Power Interface), which is available to certain PCIe slots - on Power 8 systems. CAPI can be thought of as a special tunneling - protocol through PCIe that allow PCIe adapters to look like special - purpose co-processors which can read or write an application's - memory and generate page faults. As a result, the host interface to - an adapter running in CAPI mode does not require the data buffers to - be mapped to the device's memory (IOMMU bypass) nor does it require - memory to be pinned. - - On Linux, Coherent Accelerator (CXL) kernel services present CAPI - devices as a PCI device by implementing a virtual PCI host bridge. - This abstraction simplifies the infrastructure and programming - model, allowing for drivers to look similar to other native PCI - device drivers. - - CXL provides a mechanism by which user space applications can - directly talk to a device (network or storage) bypassing the typical - kernel/device driver stack. The CXL Flash Adapter Driver enables a - user space application direct access to Flash storage. - - The CXL Flash Adapter Driver is a kernel module that sits in the - SCSI stack as a low level device driver (below the SCSI disk and - protocol drivers) for the IBM CXL Flash Adapter. This driver is - responsible for the initialization of the adapter, setting up the - special path for user space access, and performing error recovery. It - communicates directly the Flash Accelerator Functional Unit (AFU) - as described in Documentation/powerpc/cxl.txt. - - The cxlflash driver supports two, mutually exclusive, modes of - operation at the device (LUN) level: - - - Any flash device (LUN) can be configured to be accessed as a - regular disk device (i.e.: /dev/sdc). This is the default mode. - - - Any flash device (LUN) can be configured to be accessed from - user space with a special block library. This mode further - specifies the means of accessing the device and provides for - either raw access to the entire LUN (referred to as direct - or physical LUN access) or access to a kernel/AFU-mediated - partition of the LUN (referred to as virtual LUN access). The - segmentation of a disk device into virtual LUNs is assisted - by special translation services provided by the Flash AFU. - -Overview -======== - - The Coherent Accelerator Interface Architecture (CAIA) introduces a - concept of a master context. A master typically has special privileges - granted to it by the kernel or hypervisor allowing it to perform AFU - wide management and control. The master may or may not be involved - directly in each user I/O, but at the minimum is involved in the - initial setup before the user application is allowed to send requests - directly to the AFU. - - The CXL Flash Adapter Driver establishes a master context with the - AFU. It uses memory mapped I/O (MMIO) for this control and setup. The - Adapter Problem Space Memory Map looks like this: - - +-------------------------------+ - | 512 * 64 KB User MMIO | - | (per context) | - | User Accessible | - +-------------------------------+ - | 512 * 128 B per context | - | Provisioning and Control | - | Trusted Process accessible | - +-------------------------------+ - | 64 KB Global | - | Trusted Process accessible | - +-------------------------------+ - - This driver configures itself into the SCSI software stack as an - adapter driver. The driver is the only entity that is considered a - Trusted Process to program the Provisioning and Control and Global - areas in the MMIO Space shown above. The master context driver - discovers all LUNs attached to the CXL Flash adapter and instantiates - scsi block devices (/dev/sdb, /dev/sdc etc.) for each unique LUN - seen from each path. - - Once these scsi block devices are instantiated, an application - written to a specification provided by the block library may get - access to the Flash from user space (without requiring a system call). - - This master context driver also provides a series of ioctls for this - block library to enable this user space access. The driver supports - two modes for accessing the block device. - - The first mode is called a virtual mode. In this mode a single scsi - block device (/dev/sdb) may be carved up into any number of distinct - virtual LUNs. The virtual LUNs may be resized as long as the sum of - the sizes of all the virtual LUNs, along with the meta-data associated - with it does not exceed the physical capacity. - - The second mode is called the physical mode. In this mode a single - block device (/dev/sdb) may be opened directly by the block library - and the entire space for the LUN is available to the application. - - Only the physical mode provides persistence of the data. i.e. The - data written to the block device will survive application exit and - restart and also reboot. The virtual LUNs do not persist (i.e. do - not survive after the application terminates or the system reboots). - - -Block library API -================= - - Applications intending to get access to the CXL Flash from user - space should use the block library, as it abstracts the details of - interfacing directly with the cxlflash driver that are necessary for - performing administrative actions (i.e.: setup, tear down, resize). - The block library can be thought of as a 'user' of services, - implemented as IOCTLs, that are provided by the cxlflash driver - specifically for devices (LUNs) operating in user space access - mode. While it is not a requirement that applications understand - the interface between the block library and the cxlflash driver, - a high-level overview of each supported service (IOCTL) is provided - below. - - The block library can be found on GitHub: - http://github.com/open-power/capiflash - - -CXL Flash Driver LUN IOCTLs -=========================== - - Users, such as the block library, that wish to interface with a flash - device (LUN) via user space access need to use the services provided - by the cxlflash driver. As these services are implemented as ioctls, - a file descriptor handle must first be obtained in order to establish - the communication channel between a user and the kernel. This file - descriptor is obtained by opening the device special file associated - with the scsi disk device (/dev/sdb) that was created during LUN - discovery. As per the location of the cxlflash driver within the - SCSI protocol stack, this open is actually not seen by the cxlflash - driver. Upon successful open, the user receives a file descriptor - (herein referred to as fd1) that should be used for issuing the - subsequent ioctls listed below. - - The structure definitions for these IOCTLs are available in: - uapi/scsi/cxlflash_ioctl.h - -DK_CXLFLASH_ATTACH ------------------- - - This ioctl obtains, initializes, and starts a context using the CXL - kernel services. These services specify a context id (u16) by which - to uniquely identify the context and its allocated resources. The - services additionally provide a second file descriptor (herein - referred to as fd2) that is used by the block library to initiate - memory mapped I/O (via mmap()) to the CXL flash device and poll for - completion events. This file descriptor is intentionally installed by - this driver and not the CXL kernel services to allow for intermediary - notification and access in the event of a non-user-initiated close(), - such as a killed process. This design point is described in further - detail in the description for the DK_CXLFLASH_DETACH ioctl. - - There are a few important aspects regarding the "tokens" (context id - and fd2) that are provided back to the user: - - - These tokens are only valid for the process under which they - were created. The child of a forked process cannot continue - to use the context id or file descriptor created by its parent - (see DK_CXLFLASH_VLUN_CLONE for further details). - - - These tokens are only valid for the lifetime of the context and - the process under which they were created. Once either is - destroyed, the tokens are to be considered stale and subsequent - usage will result in errors. - - - A valid adapter file descriptor (fd2 >= 0) is only returned on - the initial attach for a context. Subsequent attaches to an - existing context (DK_CXLFLASH_ATTACH_REUSE_CONTEXT flag present) - do not provide the adapter file descriptor as it was previously - made known to the application. - - - When a context is no longer needed, the user shall detach from - the context via the DK_CXLFLASH_DETACH ioctl. When this ioctl - returns with a valid adapter file descriptor and the return flag - DK_CXLFLASH_APP_CLOSE_ADAP_FD is present, the application _must_ - close the adapter file descriptor following a successful detach. - - - When this ioctl returns with a valid fd2 and the return flag - DK_CXLFLASH_APP_CLOSE_ADAP_FD is present, the application _must_ - close fd2 in the following circumstances: - - + Following a successful detach of the last user of the context - + Following a successful recovery on the context's original fd2 - + In the child process of a fork(), following a clone ioctl, - on the fd2 associated with the source context - - - At any time, a close on fd2 will invalidate the tokens. Applications - should exercise caution to only close fd2 when appropriate (outlined - in the previous bullet) to avoid premature loss of I/O. - -DK_CXLFLASH_USER_DIRECT ------------------------ - This ioctl is responsible for transitioning the LUN to direct - (physical) mode access and configuring the AFU for direct access from - user space on a per-context basis. Additionally, the block size and - last logical block address (LBA) are returned to the user. - - As mentioned previously, when operating in user space access mode, - LUNs may be accessed in whole or in part. Only one mode is allowed - at a time and if one mode is active (outstanding references exist), - requests to use the LUN in a different mode are denied. - - The AFU is configured for direct access from user space by adding an - entry to the AFU's resource handle table. The index of the entry is - treated as a resource handle that is returned to the user. The user - is then able to use the handle to reference the LUN during I/O. - -DK_CXLFLASH_USER_VIRTUAL ------------------------- - This ioctl is responsible for transitioning the LUN to virtual mode - of access and configuring the AFU for virtual access from user space - on a per-context basis. Additionally, the block size and last logical - block address (LBA) are returned to the user. - - As mentioned previously, when operating in user space access mode, - LUNs may be accessed in whole or in part. Only one mode is allowed - at a time and if one mode is active (outstanding references exist), - requests to use the LUN in a different mode are denied. - - The AFU is configured for virtual access from user space by adding - an entry to the AFU's resource handle table. The index of the entry - is treated as a resource handle that is returned to the user. The - user is then able to use the handle to reference the LUN during I/O. - - By default, the virtual LUN is created with a size of 0. The user - would need to use the DK_CXLFLASH_VLUN_RESIZE ioctl to adjust the grow - the virtual LUN to a desired size. To avoid having to perform this - resize for the initial creation of the virtual LUN, the user has the - option of specifying a size as part of the DK_CXLFLASH_USER_VIRTUAL - ioctl, such that when success is returned to the user, the - resource handle that is provided is already referencing provisioned - storage. This is reflected by the last LBA being a non-zero value. - - When a LUN is accessible from more than one port, this ioctl will - return with the DK_CXLFLASH_ALL_PORTS_ACTIVE return flag set. This - provides the user with a hint that I/O can be retried in the event - of an I/O error as the LUN can be reached over multiple paths. - -DK_CXLFLASH_VLUN_RESIZE ------------------------ - This ioctl is responsible for resizing a previously created virtual - LUN and will fail if invoked upon a LUN that is not in virtual - mode. Upon success, an updated last LBA is returned to the user - indicating the new size of the virtual LUN associated with the - resource handle. - - The partitioning of virtual LUNs is jointly mediated by the cxlflash - driver and the AFU. An allocation table is kept for each LUN that is - operating in the virtual mode and used to program a LUN translation - table that the AFU references when provided with a resource handle. - - This ioctl can return -EAGAIN if an AFU sync operation takes too long. - In addition to returning a failure to user, cxlflash will also schedule - an asynchronous AFU reset. Should the user choose to retry the operation, - it is expected to succeed. If this ioctl fails with -EAGAIN, the user - can either retry the operation or treat it as a failure. - -DK_CXLFLASH_RELEASE -------------------- - This ioctl is responsible for releasing a previously obtained - reference to either a physical or virtual LUN. This can be - thought of as the inverse of the DK_CXLFLASH_USER_DIRECT or - DK_CXLFLASH_USER_VIRTUAL ioctls. Upon success, the resource handle - is no longer valid and the entry in the resource handle table is - made available to be used again. - - As part of the release process for virtual LUNs, the virtual LUN - is first resized to 0 to clear out and free the translation tables - associated with the virtual LUN reference. - -DK_CXLFLASH_DETACH ------------------- - This ioctl is responsible for unregistering a context with the - cxlflash driver and release outstanding resources that were - not explicitly released via the DK_CXLFLASH_RELEASE ioctl. Upon - success, all "tokens" which had been provided to the user from the - DK_CXLFLASH_ATTACH onward are no longer valid. - - When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful - attach, the application _must_ close the fd2 associated with the context - following the detach of the final user of the context. - -DK_CXLFLASH_VLUN_CLONE ----------------------- - This ioctl is responsible for cloning a previously created - context to a more recently created context. It exists solely to - support maintaining user space access to storage after a process - forks. Upon success, the child process (which invoked the ioctl) - will have access to the same LUNs via the same resource handle(s) - as the parent, but under a different context. - - Context sharing across processes is not supported with CXL and - therefore each fork must be met with establishing a new context - for the child process. This ioctl simplifies the state management - and playback required by a user in such a scenario. When a process - forks, child process can clone the parents context by first creating - a context (via DK_CXLFLASH_ATTACH) and then using this ioctl to - perform the clone from the parent to the child. - - The clone itself is fairly simple. The resource handle and lun - translation tables are copied from the parent context to the child's - and then synced with the AFU. - - When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful - attach, the application _must_ close the fd2 associated with the source - context (still resident/accessible in the parent process) following the - clone. This is to avoid a stale entry in the file descriptor table of the - child process. - - This ioctl can return -EAGAIN if an AFU sync operation takes too long. - In addition to returning a failure to user, cxlflash will also schedule - an asynchronous AFU reset. Should the user choose to retry the operation, - it is expected to succeed. If this ioctl fails with -EAGAIN, the user - can either retry the operation or treat it as a failure. - -DK_CXLFLASH_VERIFY ------------------- - This ioctl is used to detect various changes such as the capacity of - the disk changing, the number of LUNs visible changing, etc. In cases - where the changes affect the application (such as a LUN resize), the - cxlflash driver will report the changed state to the application. - - The user calls in when they want to validate that a LUN hasn't been - changed in response to a check condition. As the user is operating out - of band from the kernel, they will see these types of events without - the kernel's knowledge. When encountered, the user's architected - behavior is to call in to this ioctl, indicating what they want to - verify and passing along any appropriate information. For now, only - verifying a LUN change (ie: size different) with sense data is - supported. - -DK_CXLFLASH_RECOVER_AFU ------------------------ - This ioctl is used to drive recovery (if such an action is warranted) - of a specified user context. Any state associated with the user context - is re-established upon successful recovery. - - User contexts are put into an error condition when the device needs to - be reset or is terminating. Users are notified of this error condition - by seeing all 0xF's on an MMIO read. Upon encountering this, the - architected behavior for a user is to call into this ioctl to recover - their context. A user may also call into this ioctl at any time to - check if the device is operating normally. If a failure is returned - from this ioctl, the user is expected to gracefully clean up their - context via release/detach ioctls. Until they do, the context they - hold is not relinquished. The user may also optionally exit the process - at which time the context/resources they held will be freed as part of - the release fop. - - When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful - attach, the application _must_ unmap and close the fd2 associated with the - original context following this ioctl returning success and indicating that - the context was recovered (DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET). - -DK_CXLFLASH_MANAGE_LUN ----------------------- - This ioctl is used to switch a LUN from a mode where it is available - for file-system access (legacy), to a mode where it is set aside for - exclusive user space access (superpipe). In case a LUN is visible - across multiple ports and adapters, this ioctl is used to uniquely - identify each LUN by its World Wide Node Name (WWNN). - - -CXL Flash Driver Host IOCTLs -============================ - - Each host adapter instance that is supported by the cxlflash driver - has a special character device associated with it to enable a set of - host management function. These character devices are hosted in a - class dedicated for cxlflash and can be accessed via /dev/cxlflash/*. - - Applications can be written to perform various functions using the - host ioctl APIs below. - - The structure definitions for these IOCTLs are available in: - uapi/scsi/cxlflash_ioctl.h - -HT_CXLFLASH_LUN_PROVISION -------------------------- - This ioctl is used to create and delete persistent LUNs on cxlflash - devices that lack an external LUN management interface. It is only - valid when used with AFUs that support the LUN provision capability. - - When sufficient space is available, LUNs can be created by specifying - the target port to host the LUN and a desired size in 4K blocks. Upon - success, the LUN ID and WWID of the created LUN will be returned and - the SCSI bus can be scanned to detect the change in LUN topology. Note - that partial allocations are not supported. Should a creation fail due - to a space issue, the target port can be queried for its current LUN - geometry. - - To remove a LUN, the device must first be disassociated from the Linux - SCSI subsystem. The LUN deletion can then be initiated by specifying a - target port and LUN ID. Upon success, the LUN geometry associated with - the port will be updated to reflect new number of provisioned LUNs and - available capacity. - - To query the LUN geometry of a port, the target port is specified and - upon success, the following information is presented: - - - Maximum number of provisioned LUNs allowed for the port - - Current number of provisioned LUNs for the port - - Maximum total capacity of provisioned LUNs for the port (4K blocks) - - Current total capacity of provisioned LUNs for the port (4K blocks) - - With this information, the number of available LUNs and capacity can be - can be calculated. - -HT_CXLFLASH_AFU_DEBUG ---------------------- - This ioctl is used to debug AFUs by supporting a command pass-through - interface. It is only valid when used with AFUs that support the AFU - debug capability. - - With exception of buffer management, AFU debug commands are opaque to - cxlflash and treated as pass-through. For debug commands that do require - data transfer, the user supplies an adequately sized data buffer and must - specify the data transfer direction with respect to the host. There is a - maximum transfer size of 256K imposed. Note that partial read completions - are not supported - when errors are experienced with a host read data - transfer, the data buffer is not copied back to the user. diff --git a/Documentation/powerpc/dawr-power9.rst b/Documentation/powerpc/dawr-power9.rst new file mode 100644 index 000000000000..c96ab6befd9c --- /dev/null +++ b/Documentation/powerpc/dawr-power9.rst @@ -0,0 +1,93 @@ +===================== +DAWR issues on POWER9 +===================== + +On POWER9 the Data Address Watchpoint Register (DAWR) can cause a checkstop +if it points to cache inhibited (CI) memory. Currently Linux has no way to +disinguish CI memory when configuring the DAWR, so (for now) the DAWR is +disabled by this commit:: + + commit 9654153158d3e0684a1bdb76dbababdb7111d5a0 + Author: Michael Neuling + Date: Tue Mar 27 15:37:24 2018 +1100 + powerpc: Disable DAWR in the base POWER9 CPU features + +Technical Details: +================== + +DAWR has 6 different ways of being set. +1) ptrace +2) h_set_mode(DAWR) +3) h_set_dabr() +4) kvmppc_set_one_reg() +5) xmon + +For ptrace, we now advertise zero breakpoints on POWER9 via the +PPC_PTRACE_GETHWDBGINFO call. This results in GDB falling back to +software emulation of the watchpoint (which is slow). + +h_set_mode(DAWR) and h_set_dabr() will now return an error to the +guest on a POWER9 host. Current Linux guests ignore this error, so +they will silently not get the DAWR. + +kvmppc_set_one_reg() will store the value in the vcpu but won't +actually set it on POWER9 hardware. This is done so we don't break +migration from POWER8 to POWER9, at the cost of silently losing the +DAWR on the migration. + +For xmon, the 'bd' command will return an error on P9. + +Consequences for users +====================== + +For GDB watchpoints (ie 'watch' command) on POWER9 bare metal , GDB +will accept the command. Unfortunately since there is no hardware +support for the watchpoint, GDB will software emulate the watchpoint +making it run very slowly. + +The same will also be true for any guests started on a POWER9 +host. The watchpoint will fail and GDB will fall back to software +emulation. + +If a guest is started on a POWER8 host, GDB will accept the watchpoint +and configure the hardware to use the DAWR. This will run at full +speed since it can use the hardware emulation. Unfortunately if this +guest is migrated to a POWER9 host, the watchpoint will be lost on the +POWER9. Loads and stores to the watchpoint locations will not be +trapped in GDB. The watchpoint is remembered, so if the guest is +migrated back to the POWER8 host, it will start working again. + +Force enabling the DAWR +======================= +Kernels (since ~v5.2) have an option to force enable the DAWR via:: + + echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous + +This enables the DAWR even on POWER9. + +This is a dangerous setting, USE AT YOUR OWN RISK. + +Some users may not care about a bad user crashing their box +(ie. single user/desktop systems) and really want the DAWR. This +allows them to force enable DAWR. + +This flag can also be used to disable DAWR access. Once this is +cleared, all DAWR access should be cleared immediately and your +machine once again safe from crashing. + +Userspace may get confused by toggling this. If DAWR is force +enabled/disabled between getting the number of breakpoints (via +PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an +inconsistent view of what's available. Similarly for guests. + +For the DAWR to be enabled in a KVM guest, the DAWR needs to be force +enabled in the host AND the guest. For this reason, this won't work on +POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the +dawr_enable_dangerous file will fail if the hypervisor doesn't support +writing the DAWR. + +To double check the DAWR is working, run this kernel selftest: + + tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c + +Any errors/failures/skips mean something is wrong. diff --git a/Documentation/powerpc/dscr.rst b/Documentation/powerpc/dscr.rst new file mode 100644 index 000000000000..2ab99006014c --- /dev/null +++ b/Documentation/powerpc/dscr.rst @@ -0,0 +1,87 @@ +=================================== +DSCR (Data Stream Control Register) +=================================== + +DSCR register in powerpc allows user to have some control of prefetch of data +stream in the processor. Please refer to the ISA documents or related manual +for more detailed information regarding how to use this DSCR to attain this +control of the prefetches . This document here provides an overview of kernel +support for DSCR, related kernel objects, it's functionalities and exported +user interface. + +(A) Data Structures: + + (1) thread_struct:: + + dscr /* Thread DSCR value */ + dscr_inherit /* Thread has changed default DSCR */ + + (2) PACA:: + + dscr_default /* per-CPU DSCR default value */ + + (3) sysfs.c:: + + dscr_default /* System DSCR default value */ + +(B) Scheduler Changes: + + Scheduler will write the per-CPU DSCR default which is stored in the + CPU's PACA value into the register if the thread has dscr_inherit value + cleared which means that it has not changed the default DSCR till now. + If the dscr_inherit value is set which means that it has changed the + default DSCR value, scheduler will write the changed value which will + now be contained in thread struct's dscr into the register instead of + the per-CPU default PACA based DSCR value. + + NOTE: Please note here that the system wide global DSCR value never + gets used directly in the scheduler process context switch at all. + +(C) SYSFS Interface: + + - Global DSCR default: /sys/devices/system/cpu/dscr_default + - CPU specific DSCR default: /sys/devices/system/cpu/cpuN/dscr + + Changing the global DSCR default in the sysfs will change all the CPU + specific DSCR defaults immediately in their PACA structures. Again if + the current process has the dscr_inherit clear, it also writes the new + value into every CPU's DSCR register right away and updates the current + thread's DSCR value as well. + + Changing the CPU specific DSCR default value in the sysfs does exactly + the same thing as above but unlike the global one above, it just changes + stuff for that particular CPU instead for all the CPUs on the system. + +(D) User Space Instructions: + + The DSCR register can be accessed in the user space using any of these + two SPR numbers available for that purpose. + + (1) Problem state SPR: 0x03 (Un-privileged, POWER8 only) + (2) Privileged state SPR: 0x11 (Privileged) + + Accessing DSCR through privileged SPR number (0x11) from user space + works, as it is emulated following an illegal instruction exception + inside the kernel. Both mfspr and mtspr instructions are emulated. + + Accessing DSCR through user level SPR (0x03) from user space will first + create a facility unavailable exception. Inside this exception handler + all mfspr instruction based read attempts will get emulated and returned + where as the first mtspr instruction based write attempts will enable + the DSCR facility for the next time around (both for read and write) by + setting DSCR facility in the FSCR register. + +(E) Specifics about 'dscr_inherit': + + The thread struct element 'dscr_inherit' represents whether the thread + in question has attempted and changed the DSCR itself using any of the + following methods. This element signifies whether the thread wants to + use the CPU default DSCR value or its own changed DSCR value in the + kernel. + + (1) mtspr instruction (SPR number 0x03) + (2) mtspr instruction (SPR number 0x11) + (3) ptrace interface (Explicitly set user DSCR value) + + Any child of the process created after this event in the process inherits + this same behaviour as well. diff --git a/Documentation/powerpc/dscr.txt b/Documentation/powerpc/dscr.txt deleted file mode 100644 index ece300c64f76..000000000000 --- a/Documentation/powerpc/dscr.txt +++ /dev/null @@ -1,83 +0,0 @@ - DSCR (Data Stream Control Register) - ================================================ - -DSCR register in powerpc allows user to have some control of prefetch of data -stream in the processor. Please refer to the ISA documents or related manual -for more detailed information regarding how to use this DSCR to attain this -control of the prefetches . This document here provides an overview of kernel -support for DSCR, related kernel objects, it's functionalities and exported -user interface. - -(A) Data Structures: - - (1) thread_struct: - dscr /* Thread DSCR value */ - dscr_inherit /* Thread has changed default DSCR */ - - (2) PACA: - dscr_default /* per-CPU DSCR default value */ - - (3) sysfs.c: - dscr_default /* System DSCR default value */ - -(B) Scheduler Changes: - - Scheduler will write the per-CPU DSCR default which is stored in the - CPU's PACA value into the register if the thread has dscr_inherit value - cleared which means that it has not changed the default DSCR till now. - If the dscr_inherit value is set which means that it has changed the - default DSCR value, scheduler will write the changed value which will - now be contained in thread struct's dscr into the register instead of - the per-CPU default PACA based DSCR value. - - NOTE: Please note here that the system wide global DSCR value never - gets used directly in the scheduler process context switch at all. - -(C) SYSFS Interface: - - Global DSCR default: /sys/devices/system/cpu/dscr_default - CPU specific DSCR default: /sys/devices/system/cpu/cpuN/dscr - - Changing the global DSCR default in the sysfs will change all the CPU - specific DSCR defaults immediately in their PACA structures. Again if - the current process has the dscr_inherit clear, it also writes the new - value into every CPU's DSCR register right away and updates the current - thread's DSCR value as well. - - Changing the CPU specific DSCR default value in the sysfs does exactly - the same thing as above but unlike the global one above, it just changes - stuff for that particular CPU instead for all the CPUs on the system. - -(D) User Space Instructions: - - The DSCR register can be accessed in the user space using any of these - two SPR numbers available for that purpose. - - (1) Problem state SPR: 0x03 (Un-privileged, POWER8 only) - (2) Privileged state SPR: 0x11 (Privileged) - - Accessing DSCR through privileged SPR number (0x11) from user space - works, as it is emulated following an illegal instruction exception - inside the kernel. Both mfspr and mtspr instructions are emulated. - - Accessing DSCR through user level SPR (0x03) from user space will first - create a facility unavailable exception. Inside this exception handler - all mfspr instruction based read attempts will get emulated and returned - where as the first mtspr instruction based write attempts will enable - the DSCR facility for the next time around (both for read and write) by - setting DSCR facility in the FSCR register. - -(E) Specifics about 'dscr_inherit': - - The thread struct element 'dscr_inherit' represents whether the thread - in question has attempted and changed the DSCR itself using any of the - following methods. This element signifies whether the thread wants to - use the CPU default DSCR value or its own changed DSCR value in the - kernel. - - (1) mtspr instruction (SPR number 0x03) - (2) mtspr instruction (SPR number 0x11) - (3) ptrace interface (Explicitly set user DSCR value) - - Any child of the process created after this event in the process inherits - this same behaviour as well. diff --git a/Documentation/powerpc/eeh-pci-error-recovery.rst b/Documentation/powerpc/eeh-pci-error-recovery.rst new file mode 100644 index 000000000000..438a87ebc095 --- /dev/null +++ b/Documentation/powerpc/eeh-pci-error-recovery.rst @@ -0,0 +1,336 @@ +========================== +PCI Bus EEH Error Recovery +========================== + +Linas Vepstas + +12 January 2005 + + +Overview: +--------- +The IBM POWER-based pSeries and iSeries computers include PCI bus +controller chips that have extended capabilities for detecting and +reporting a large variety of PCI bus error conditions. These features +go under the name of "EEH", for "Enhanced Error Handling". The EEH +hardware features allow PCI bus errors to be cleared and a PCI +card to be "rebooted", without also having to reboot the operating +system. + +This is in contrast to traditional PCI error handling, where the +PCI chip is wired directly to the CPU, and an error would cause +a CPU machine-check/check-stop condition, halting the CPU entirely. +Another "traditional" technique is to ignore such errors, which +can lead to data corruption, both of user data or of kernel data, +hung/unresponsive adapters, or system crashes/lockups. Thus, +the idea behind EEH is that the operating system can become more +reliable and robust by protecting it from PCI errors, and giving +the OS the ability to "reboot"/recover individual PCI devices. + +Future systems from other vendors, based on the PCI-E specification, +may contain similar features. + + +Causes of EEH Errors +-------------------- +EEH was originally designed to guard against hardware failure, such +as PCI cards dying from heat, humidity, dust, vibration and bad +electrical connections. The vast majority of EEH errors seen in +"real life" are due to either poorly seated PCI cards, or, +unfortunately quite commonly, due to device driver bugs, device firmware +bugs, and sometimes PCI card hardware bugs. + +The most common software bug, is one that causes the device to +attempt to DMA to a location in system memory that has not been +reserved for DMA access for that card. This is a powerful feature, +as it prevents what; otherwise, would have been silent memory +corruption caused by the bad DMA. A number of device driver +bugs have been found and fixed in this way over the past few +years. Other possible causes of EEH errors include data or +address line parity errors (for example, due to poor electrical +connectivity due to a poorly seated card), and PCI-X split-completion +errors (due to software, device firmware, or device PCI hardware bugs). +The vast majority of "true hardware failures" can be cured by +physically removing and re-seating the PCI card. + + +Detection and Recovery +---------------------- +In the following discussion, a generic overview of how to detect +and recover from EEH errors will be presented. This is followed +by an overview of how the current implementation in the Linux +kernel does it. The actual implementation is subject to change, +and some of the finer points are still being debated. These +may in turn be swayed if or when other architectures implement +similar functionality. + +When a PCI Host Bridge (PHB, the bus controller connecting the +PCI bus to the system CPU electronics complex) detects a PCI error +condition, it will "isolate" the affected PCI card. Isolation +will block all writes (either to the card from the system, or +from the card to the system), and it will cause all reads to +return all-ff's (0xff, 0xffff, 0xffffffff for 8/16/32-bit reads). +This value was chosen because it is the same value you would +get if the device was physically unplugged from the slot. +This includes access to PCI memory, I/O space, and PCI config +space. Interrupts; however, will continued to be delivered. + +Detection and recovery are performed with the aid of ppc64 +firmware. The programming interfaces in the Linux kernel +into the firmware are referred to as RTAS (Run-Time Abstraction +Services). The Linux kernel does not (should not) access +the EEH function in the PCI chipsets directly, primarily because +there are a number of different chipsets out there, each with +different interfaces and quirks. The firmware provides a +uniform abstraction layer that will work with all pSeries +and iSeries hardware (and be forwards-compatible). + +If the OS or device driver suspects that a PCI slot has been +EEH-isolated, there is a firmware call it can make to determine if +this is the case. If so, then the device driver should put itself +into a consistent state (given that it won't be able to complete any +pending work) and start recovery of the card. Recovery normally +would consist of resetting the PCI device (holding the PCI #RST +line high for two seconds), followed by setting up the device +config space (the base address registers (BAR's), latency timer, +cache line size, interrupt line, and so on). This is followed by a +reinitialization of the device driver. In a worst-case scenario, +the power to the card can be toggled, at least on hot-plug-capable +slots. In principle, layers far above the device driver probably +do not need to know that the PCI card has been "rebooted" in this +way; ideally, there should be at most a pause in Ethernet/disk/USB +I/O while the card is being reset. + +If the card cannot be recovered after three or four resets, the +kernel/device driver should assume the worst-case scenario, that the +card has died completely, and report this error to the sysadmin. +In addition, error messages are reported through RTAS and also through +syslogd (/var/log/messages) to alert the sysadmin of PCI resets. +The correct way to deal with failed adapters is to use the standard +PCI hotplug tools to remove and replace the dead card. + + +Current PPC64 Linux EEH Implementation +-------------------------------------- +At this time, a generic EEH recovery mechanism has been implemented, +so that individual device drivers do not need to be modified to support +EEH recovery. This generic mechanism piggy-backs on the PCI hotplug +infrastructure, and percolates events up through the userspace/udev +infrastructure. Following is a detailed description of how this is +accomplished. + +EEH must be enabled in the PHB's very early during the boot process, +and if a PCI slot is hot-plugged. The former is performed by +eeh_init() in arch/powerpc/platforms/pseries/eeh.c, and the later by +drivers/pci/hotplug/pSeries_pci.c calling in to the eeh.c code. +EEH must be enabled before a PCI scan of the device can proceed. +Current Power5 hardware will not work unless EEH is enabled; +although older Power4 can run with it disabled. Effectively, +EEH can no longer be turned off. PCI devices *must* be +registered with the EEH code; the EEH code needs to know about +the I/O address ranges of the PCI device in order to detect an +error. Given an arbitrary address, the routine +pci_get_device_by_addr() will find the pci device associated +with that address (if any). + +The default arch/powerpc/include/asm/io.h macros readb(), inb(), insb(), +etc. include a check to see if the i/o read returned all-0xff's. +If so, these make a call to eeh_dn_check_failure(), which in turn +asks the firmware if the all-ff's value is the sign of a true EEH +error. If it is not, processing continues as normal. The grand +total number of these false alarms or "false positives" can be +seen in /proc/ppc64/eeh (subject to change). Normally, almost +all of these occur during boot, when the PCI bus is scanned, where +a large number of 0xff reads are part of the bus scan procedure. + +If a frozen slot is detected, code in +arch/powerpc/platforms/pseries/eeh.c will print a stack trace to +syslog (/var/log/messages). This stack trace has proven to be very +useful to device-driver authors for finding out at what point the EEH +error was detected, as the error itself usually occurs slightly +beforehand. + +Next, it uses the Linux kernel notifier chain/work queue mechanism to +allow any interested parties to find out about the failure. Device +drivers, or other parts of the kernel, can use +`eeh_register_notifier(struct notifier_block *)` to find out about EEH +events. The event will include a pointer to the pci device, the +device node and some state info. Receivers of the event can "do as +they wish"; the default handler will be described further in this +section. + +To assist in the recovery of the device, eeh.c exports the +following functions: + +rtas_set_slot_reset() + assert the PCI #RST line for 1/8th of a second +rtas_configure_bridge() + ask firmware to configure any PCI bridges + located topologically under the pci slot. +eeh_save_bars() and eeh_restore_bars(): + save and restore the PCI + config-space info for a device and any devices under it. + + +A handler for the EEH notifier_block events is implemented in +drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events(). +It saves the device BAR's and then calls rpaphp_unconfig_pci_adapter(). +This last call causes the device driver for the card to be stopped, +which causes uevents to go out to user space. This triggers +user-space scripts that might issue commands such as "ifdown eth0" +for ethernet cards, and so on. This handler then sleeps for 5 seconds, +hoping to give the user-space scripts enough time to complete. +It then resets the PCI card, reconfigures the device BAR's, and +any bridges underneath. It then calls rpaphp_enable_pci_slot(), +which restarts the device driver and triggers more user-space +events (for example, calling "ifup eth0" for ethernet cards). + + +Device Shutdown and User-Space Events +------------------------------------- +This section documents what happens when a pci slot is unconfigured, +focusing on how the device driver gets shut down, and on how the +events get delivered to user-space scripts. + +Following is an example sequence of events that cause a device driver +close function to be called during the first phase of an EEH reset. +The following sequence is an example of the pcnet32 device driver:: + + rpa_php_unconfig_pci_adapter (struct slot *) // in rpaphp_pci.c + { + calls + pci_remove_bus_device (struct pci_dev *) // in /drivers/pci/remove.c + { + calls + pci_destroy_dev (struct pci_dev *) + { + calls + device_unregister (&dev->dev) // in /drivers/base/core.c + { + calls + device_del (struct device *) + { + calls + bus_remove_device() // in /drivers/base/bus.c + { + calls + device_release_driver() + { + calls + struct device_driver->remove() which is just + pci_device_remove() // in /drivers/pci/pci_driver.c + { + calls + struct pci_driver->remove() which is just + pcnet32_remove_one() // in /drivers/net/pcnet32.c + { + calls + unregister_netdev() // in /net/core/dev.c + { + calls + dev_close() // in /net/core/dev.c + { + calls dev->stop(); + which is just pcnet32_close() // in pcnet32.c + { + which does what you wanted + to stop the device + } + } + } + which + frees pcnet32 device driver memory + } + }}}}}} + + +in drivers/pci/pci_driver.c, +struct device_driver->remove() is just pci_device_remove() +which calls struct pci_driver->remove() which is pcnet32_remove_one() +which calls unregister_netdev() (in net/core/dev.c) +which calls dev_close() (in net/core/dev.c) +which calls dev->stop() which is pcnet32_close() +which then does the appropriate shutdown. + +--- + +Following is the analogous stack trace for events sent to user-space +when the pci device is unconfigured:: + + rpa_php_unconfig_pci_adapter() { // in rpaphp_pci.c + calls + pci_remove_bus_device (struct pci_dev *) { // in /drivers/pci/remove.c + calls + pci_destroy_dev (struct pci_dev *) { + calls + device_unregister (&dev->dev) { // in /drivers/base/core.c + calls + device_del(struct device * dev) { // in /drivers/base/core.c + calls + kobject_del() { //in /libs/kobject.c + calls + kobject_uevent() { // in /libs/kobject.c + calls + kset_uevent() { // in /lib/kobject.c + calls + kset->uevent_ops->uevent() // which is really just + a call to + dev_uevent() { // in /drivers/base/core.c + calls + dev->bus->uevent() which is really just a call to + pci_uevent () { // in drivers/pci/hotplug.c + which prints device name, etc.... + } + } + then kobject_uevent() sends a netlink uevent to userspace + --> userspace uevent + (during early boot, nobody listens to netlink events and + kobject_uevent() executes uevent_helper[], which runs the + event process /sbin/hotplug) + } + } + kobject_del() then calls sysfs_remove_dir(), which would + trigger any user-space daemon that was watching /sysfs, + and notice the delete event. + + +Pro's and Con's of the Current Design +------------------------------------- +There are several issues with the current EEH software recovery design, +which may be addressed in future revisions. But first, note that the +big plus of the current design is that no changes need to be made to +individual device drivers, so that the current design throws a wide net. +The biggest negative of the design is that it potentially disturbs +network daemons and file systems that didn't need to be disturbed. + +- A minor complaint is that resetting the network card causes + user-space back-to-back ifdown/ifup burps that potentially disturb + network daemons, that didn't need to even know that the pci + card was being rebooted. + +- A more serious concern is that the same reset, for SCSI devices, + causes havoc to mounted file systems. Scripts cannot post-facto + unmount a file system without flushing pending buffers, but this + is impossible, because I/O has already been stopped. Thus, + ideally, the reset should happen at or below the block layer, + so that the file systems are not disturbed. + + Reiserfs does not tolerate errors returned from the block device. + Ext3fs seems to be tolerant, retrying reads/writes until it does + succeed. Both have been only lightly tested in this scenario. + + The SCSI-generic subsystem already has built-in code for performing + SCSI device resets, SCSI bus resets, and SCSI host-bus-adapter + (HBA) resets. These are cascaded into a chain of attempted + resets if a SCSI command fails. These are completely hidden + from the block layer. It would be very natural to add an EEH + reset into this chain of events. + +- If a SCSI error occurs for the root device, all is lost unless + the sysadmin had the foresight to run /bin, /sbin, /etc, /var + and so on, out of ramdisk/tmpfs. + + +Conclusions +----------- +There's forward progress ... diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt deleted file mode 100644 index 678189280bb4..000000000000 --- a/Documentation/powerpc/eeh-pci-error-recovery.txt +++ /dev/null @@ -1,334 +0,0 @@ - - - PCI Bus EEH Error Recovery - -------------------------- - Linas Vepstas - - 12 January 2005 - - -Overview: ---------- -The IBM POWER-based pSeries and iSeries computers include PCI bus -controller chips that have extended capabilities for detecting and -reporting a large variety of PCI bus error conditions. These features -go under the name of "EEH", for "Enhanced Error Handling". The EEH -hardware features allow PCI bus errors to be cleared and a PCI -card to be "rebooted", without also having to reboot the operating -system. - -This is in contrast to traditional PCI error handling, where the -PCI chip is wired directly to the CPU, and an error would cause -a CPU machine-check/check-stop condition, halting the CPU entirely. -Another "traditional" technique is to ignore such errors, which -can lead to data corruption, both of user data or of kernel data, -hung/unresponsive adapters, or system crashes/lockups. Thus, -the idea behind EEH is that the operating system can become more -reliable and robust by protecting it from PCI errors, and giving -the OS the ability to "reboot"/recover individual PCI devices. - -Future systems from other vendors, based on the PCI-E specification, -may contain similar features. - - -Causes of EEH Errors --------------------- -EEH was originally designed to guard against hardware failure, such -as PCI cards dying from heat, humidity, dust, vibration and bad -electrical connections. The vast majority of EEH errors seen in -"real life" are due to either poorly seated PCI cards, or, -unfortunately quite commonly, due to device driver bugs, device firmware -bugs, and sometimes PCI card hardware bugs. - -The most common software bug, is one that causes the device to -attempt to DMA to a location in system memory that has not been -reserved for DMA access for that card. This is a powerful feature, -as it prevents what; otherwise, would have been silent memory -corruption caused by the bad DMA. A number of device driver -bugs have been found and fixed in this way over the past few -years. Other possible causes of EEH errors include data or -address line parity errors (for example, due to poor electrical -connectivity due to a poorly seated card), and PCI-X split-completion -errors (due to software, device firmware, or device PCI hardware bugs). -The vast majority of "true hardware failures" can be cured by -physically removing and re-seating the PCI card. - - -Detection and Recovery ----------------------- -In the following discussion, a generic overview of how to detect -and recover from EEH errors will be presented. This is followed -by an overview of how the current implementation in the Linux -kernel does it. The actual implementation is subject to change, -and some of the finer points are still being debated. These -may in turn be swayed if or when other architectures implement -similar functionality. - -When a PCI Host Bridge (PHB, the bus controller connecting the -PCI bus to the system CPU electronics complex) detects a PCI error -condition, it will "isolate" the affected PCI card. Isolation -will block all writes (either to the card from the system, or -from the card to the system), and it will cause all reads to -return all-ff's (0xff, 0xffff, 0xffffffff for 8/16/32-bit reads). -This value was chosen because it is the same value you would -get if the device was physically unplugged from the slot. -This includes access to PCI memory, I/O space, and PCI config -space. Interrupts; however, will continued to be delivered. - -Detection and recovery are performed with the aid of ppc64 -firmware. The programming interfaces in the Linux kernel -into the firmware are referred to as RTAS (Run-Time Abstraction -Services). The Linux kernel does not (should not) access -the EEH function in the PCI chipsets directly, primarily because -there are a number of different chipsets out there, each with -different interfaces and quirks. The firmware provides a -uniform abstraction layer that will work with all pSeries -and iSeries hardware (and be forwards-compatible). - -If the OS or device driver suspects that a PCI slot has been -EEH-isolated, there is a firmware call it can make to determine if -this is the case. If so, then the device driver should put itself -into a consistent state (given that it won't be able to complete any -pending work) and start recovery of the card. Recovery normally -would consist of resetting the PCI device (holding the PCI #RST -line high for two seconds), followed by setting up the device -config space (the base address registers (BAR's), latency timer, -cache line size, interrupt line, and so on). This is followed by a -reinitialization of the device driver. In a worst-case scenario, -the power to the card can be toggled, at least on hot-plug-capable -slots. In principle, layers far above the device driver probably -do not need to know that the PCI card has been "rebooted" in this -way; ideally, there should be at most a pause in Ethernet/disk/USB -I/O while the card is being reset. - -If the card cannot be recovered after three or four resets, the -kernel/device driver should assume the worst-case scenario, that the -card has died completely, and report this error to the sysadmin. -In addition, error messages are reported through RTAS and also through -syslogd (/var/log/messages) to alert the sysadmin of PCI resets. -The correct way to deal with failed adapters is to use the standard -PCI hotplug tools to remove and replace the dead card. - - -Current PPC64 Linux EEH Implementation --------------------------------------- -At this time, a generic EEH recovery mechanism has been implemented, -so that individual device drivers do not need to be modified to support -EEH recovery. This generic mechanism piggy-backs on the PCI hotplug -infrastructure, and percolates events up through the userspace/udev -infrastructure. Following is a detailed description of how this is -accomplished. - -EEH must be enabled in the PHB's very early during the boot process, -and if a PCI slot is hot-plugged. The former is performed by -eeh_init() in arch/powerpc/platforms/pseries/eeh.c, and the later by -drivers/pci/hotplug/pSeries_pci.c calling in to the eeh.c code. -EEH must be enabled before a PCI scan of the device can proceed. -Current Power5 hardware will not work unless EEH is enabled; -although older Power4 can run with it disabled. Effectively, -EEH can no longer be turned off. PCI devices *must* be -registered with the EEH code; the EEH code needs to know about -the I/O address ranges of the PCI device in order to detect an -error. Given an arbitrary address, the routine -pci_get_device_by_addr() will find the pci device associated -with that address (if any). - -The default arch/powerpc/include/asm/io.h macros readb(), inb(), insb(), -etc. include a check to see if the i/o read returned all-0xff's. -If so, these make a call to eeh_dn_check_failure(), which in turn -asks the firmware if the all-ff's value is the sign of a true EEH -error. If it is not, processing continues as normal. The grand -total number of these false alarms or "false positives" can be -seen in /proc/ppc64/eeh (subject to change). Normally, almost -all of these occur during boot, when the PCI bus is scanned, where -a large number of 0xff reads are part of the bus scan procedure. - -If a frozen slot is detected, code in -arch/powerpc/platforms/pseries/eeh.c will print a stack trace to -syslog (/var/log/messages). This stack trace has proven to be very -useful to device-driver authors for finding out at what point the EEH -error was detected, as the error itself usually occurs slightly -beforehand. - -Next, it uses the Linux kernel notifier chain/work queue mechanism to -allow any interested parties to find out about the failure. Device -drivers, or other parts of the kernel, can use -eeh_register_notifier(struct notifier_block *) to find out about EEH -events. The event will include a pointer to the pci device, the -device node and some state info. Receivers of the event can "do as -they wish"; the default handler will be described further in this -section. - -To assist in the recovery of the device, eeh.c exports the -following functions: - -rtas_set_slot_reset() -- assert the PCI #RST line for 1/8th of a second -rtas_configure_bridge() -- ask firmware to configure any PCI bridges - located topologically under the pci slot. -eeh_save_bars() and eeh_restore_bars(): save and restore the PCI - config-space info for a device and any devices under it. - - -A handler for the EEH notifier_block events is implemented in -drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events(). -It saves the device BAR's and then calls rpaphp_unconfig_pci_adapter(). -This last call causes the device driver for the card to be stopped, -which causes uevents to go out to user space. This triggers -user-space scripts that might issue commands such as "ifdown eth0" -for ethernet cards, and so on. This handler then sleeps for 5 seconds, -hoping to give the user-space scripts enough time to complete. -It then resets the PCI card, reconfigures the device BAR's, and -any bridges underneath. It then calls rpaphp_enable_pci_slot(), -which restarts the device driver and triggers more user-space -events (for example, calling "ifup eth0" for ethernet cards). - - -Device Shutdown and User-Space Events -------------------------------------- -This section documents what happens when a pci slot is unconfigured, -focusing on how the device driver gets shut down, and on how the -events get delivered to user-space scripts. - -Following is an example sequence of events that cause a device driver -close function to be called during the first phase of an EEH reset. -The following sequence is an example of the pcnet32 device driver. - - rpa_php_unconfig_pci_adapter (struct slot *) // in rpaphp_pci.c - { - calls - pci_remove_bus_device (struct pci_dev *) // in /drivers/pci/remove.c - { - calls - pci_destroy_dev (struct pci_dev *) - { - calls - device_unregister (&dev->dev) // in /drivers/base/core.c - { - calls - device_del (struct device *) - { - calls - bus_remove_device() // in /drivers/base/bus.c - { - calls - device_release_driver() - { - calls - struct device_driver->remove() which is just - pci_device_remove() // in /drivers/pci/pci_driver.c - { - calls - struct pci_driver->remove() which is just - pcnet32_remove_one() // in /drivers/net/pcnet32.c - { - calls - unregister_netdev() // in /net/core/dev.c - { - calls - dev_close() // in /net/core/dev.c - { - calls dev->stop(); - which is just pcnet32_close() // in pcnet32.c - { - which does what you wanted - to stop the device - } - } - } - which - frees pcnet32 device driver memory - } - }}}}}} - - - in drivers/pci/pci_driver.c, - struct device_driver->remove() is just pci_device_remove() - which calls struct pci_driver->remove() which is pcnet32_remove_one() - which calls unregister_netdev() (in net/core/dev.c) - which calls dev_close() (in net/core/dev.c) - which calls dev->stop() which is pcnet32_close() - which then does the appropriate shutdown. - ---- -Following is the analogous stack trace for events sent to user-space -when the pci device is unconfigured. - -rpa_php_unconfig_pci_adapter() { // in rpaphp_pci.c - calls - pci_remove_bus_device (struct pci_dev *) { // in /drivers/pci/remove.c - calls - pci_destroy_dev (struct pci_dev *) { - calls - device_unregister (&dev->dev) { // in /drivers/base/core.c - calls - device_del(struct device * dev) { // in /drivers/base/core.c - calls - kobject_del() { //in /libs/kobject.c - calls - kobject_uevent() { // in /libs/kobject.c - calls - kset_uevent() { // in /lib/kobject.c - calls - kset->uevent_ops->uevent() // which is really just - a call to - dev_uevent() { // in /drivers/base/core.c - calls - dev->bus->uevent() which is really just a call to - pci_uevent () { // in drivers/pci/hotplug.c - which prints device name, etc.... - } - } - then kobject_uevent() sends a netlink uevent to userspace - --> userspace uevent - (during early boot, nobody listens to netlink events and - kobject_uevent() executes uevent_helper[], which runs the - event process /sbin/hotplug) - } - } - kobject_del() then calls sysfs_remove_dir(), which would - trigger any user-space daemon that was watching /sysfs, - and notice the delete event. - - -Pro's and Con's of the Current Design -------------------------------------- -There are several issues with the current EEH software recovery design, -which may be addressed in future revisions. But first, note that the -big plus of the current design is that no changes need to be made to -individual device drivers, so that the current design throws a wide net. -The biggest negative of the design is that it potentially disturbs -network daemons and file systems that didn't need to be disturbed. - --- A minor complaint is that resetting the network card causes - user-space back-to-back ifdown/ifup burps that potentially disturb - network daemons, that didn't need to even know that the pci - card was being rebooted. - --- A more serious concern is that the same reset, for SCSI devices, - causes havoc to mounted file systems. Scripts cannot post-facto - unmount a file system without flushing pending buffers, but this - is impossible, because I/O has already been stopped. Thus, - ideally, the reset should happen at or below the block layer, - so that the file systems are not disturbed. - - Reiserfs does not tolerate errors returned from the block device. - Ext3fs seems to be tolerant, retrying reads/writes until it does - succeed. Both have been only lightly tested in this scenario. - - The SCSI-generic subsystem already has built-in code for performing - SCSI device resets, SCSI bus resets, and SCSI host-bus-adapter - (HBA) resets. These are cascaded into a chain of attempted - resets if a SCSI command fails. These are completely hidden - from the block layer. It would be very natural to add an EEH - reset into this chain of events. - --- If a SCSI error occurs for the root device, all is lost unless - the sysadmin had the foresight to run /bin, /sbin, /etc, /var - and so on, out of ramdisk/tmpfs. - - -Conclusions ------------ -There's forward progress ... - - diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst new file mode 100644 index 000000000000..9ca12830a48e --- /dev/null +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -0,0 +1,301 @@ +====================== +Firmware-Assisted Dump +====================== + +July 2011 + +The goal of firmware-assisted dump is to enable the dump of +a crashed system, and to do so from a fully-reset system, and +to minimize the total elapsed time until the system is back +in production use. + +- Firmware assisted dump (fadump) infrastructure is intended to replace + the existing phyp assisted dump. +- Fadump uses the same firmware interfaces and memory reservation model + as phyp assisted dump. +- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore + in the ELF format in the same way as kdump. This helps us reuse the + kdump infrastructure for dump capture and filtering. +- Unlike phyp dump, userspace tool does not need to refer any sysfs + interface while reading /proc/vmcore. +- Unlike phyp dump, fadump allows user to release all the memory reserved + for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem. +- Once enabled through kernel boot parameter, fadump can be + started/stopped through /sys/kernel/fadump_registered interface (see + sysfs files section below) and can be easily integrated with kdump + service start/stop init scripts. + +Comparing with kdump or other strategies, firmware-assisted +dump offers several strong, practical advantages: + +- Unlike kdump, the system has been reset, and loaded + with a fresh copy of the kernel. In particular, + PCI and I/O devices have been reinitialized and are + in a clean, consistent state. +- Once the dump is copied out, the memory that held the dump + is immediately available to the running kernel. And therefore, + unlike kdump, fadump doesn't need a 2nd reboot to get back + the system to the production configuration. + +The above can only be accomplished by coordination with, +and assistance from the Power firmware. The procedure is +as follows: + +- The first kernel registers the sections of memory with the + Power firmware for dump preservation during OS initialization. + These registered sections of memory are reserved by the first + kernel during early boot. + +- When a system crashes, the Power firmware will save + the low memory (boot memory of size larger of 5% of system RAM + or 256MB) of RAM to the previous registered region. It will + also save system registers, and hardware PTE's. + + NOTE: + The term 'boot memory' means size of the low memory chunk + that is required for a kernel to boot successfully when + booted with restricted memory. By default, the boot memory + size will be the larger of 5% of system RAM or 256MB. + Alternatively, user can also specify boot memory size + through boot parameter 'crashkernel=' which will override + the default calculated size. Use this option if default + boot memory size is not sufficient for second kernel to + boot successfully. For syntax of crashkernel= parameter, + refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is + provided in crashkernel= parameter, it will be ignored + as fadump uses a predefined offset to reserve memory + for boot memory dump preservation in case of a crash. + +- After the low memory (boot memory) area has been saved, the + firmware will reset PCI and other hardware state. It will + *not* clear the RAM. It will then launch the bootloader, as + normal. + +- The freshly booted kernel will notice that there is a new + node (ibm,dump-kernel) in the device tree, indicating that + there is crash data available from a previous boot. During + the early boot OS will reserve rest of the memory above + boot memory size effectively booting with restricted memory + size. This will make sure that the second kernel will not + touch any of the dump memory area. + +- User-space tools will read /proc/vmcore to obtain the contents + of memory, which holds the previous crashed kernel dump in ELF + format. The userspace tools may copy this info to disk, or + network, nas, san, iscsi, etc. as desired. + +- Once the userspace tool is done saving dump, it will echo + '1' to /sys/kernel/fadump_release_mem to release the reserved + memory back to general use, except the memory required for + next firmware-assisted dump registration. + + e.g.:: + + # echo 1 > /sys/kernel/fadump_release_mem + +Please note that the firmware-assisted dump feature +is only available on Power6 and above systems with recent +firmware versions. + +Implementation details: +----------------------- + +During boot, a check is made to see if firmware supports +this feature on that particular machine. If it does, then +we check to see if an active dump is waiting for us. If yes +then everything but boot memory size of RAM is reserved during +early boot (See Fig. 2). This area is released once we finish +collecting the dump from user land scripts (e.g. kdump scripts) +that are run. If there is dump data, then the +/sys/kernel/fadump_release_mem file is created, and the reserved +memory is held. + +If there is no waiting dump data, then only the memory required +to hold CPU state, HPTE region, boot memory dump and elfcore +header, is usually reserved at an offset greater than boot memory +size (see Fig. 1). This area is *not* released: this region will +be kept permanently reserved, so that it can act as a receptacle +for a copy of the boot memory content in addition to CPU state +and HPTE region, in the case a crash does occur. Since this reserved +memory area is used only after the system crash, there is no point in +blocking this significant chunk of memory from production kernel. +Hence, the implementation uses the Linux kernel's Contiguous Memory +Allocator (CMA) for memory reservation if CMA is configured for kernel. +With CMA reservation this memory will be available for applications to +use it, while kernel is prevented from using it. With this fadump will +still be able to capture all of the kernel memory and most of the user +space memory except the user pages that were present in CMA region:: + + o Memory Reservation during first kernel + + Low memory Top of memory + 0 boot memory size | + | | |<--Reserved dump area -->| | + V V | Permanent Reservation | V + +-----------+----------/ /---+---+----+-----------+----+------+ + | | |CPU|HPTE| DUMP |ELF | | + +-----------+----------/ /---+---+----+-----------+----+------+ + | ^ + | | + \ / + ------------------------------------------- + Boot memory content gets transferred to + reserved area by firmware at the time of + crash + Fig. 1 + + o Memory Reservation during second kernel after crash + + Low memory Top of memory + 0 boot memory size | + | |<------------- Reserved dump area ----------- -->| + V V V + +-----------+----------/ /---+---+----+-----------+----+------+ + | | |CPU|HPTE| DUMP |ELF | | + +-----------+----------/ /---+---+----+-----------+----+------+ + | | + V V + Used by second /proc/vmcore + kernel to boot + Fig. 2 + +Currently the dump will be copied from /proc/vmcore to a +a new file upon user intervention. The dump data available through +/proc/vmcore will be in ELF format. Hence the existing kdump +infrastructure (kdump scripts) to save the dump works fine with +minor modifications. + +The tools to examine the dump will be same as the ones +used for kdump. + +How to enable firmware-assisted dump (fadump): +---------------------------------------------- + +1. Set config option CONFIG_FA_DUMP=y and build kernel. +2. Boot into linux kernel with 'fadump=on' kernel cmdline option. + By default, fadump reserved memory will be initialized as CMA area. + Alternatively, user can boot linux kernel with 'fadump=nocma' to + prevent fadump to use CMA. +3. Optionally, user can also set 'crashkernel=' kernel cmdline + to specify size of the memory to reserve for boot memory dump + preservation. + +NOTE: + 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead + use 'crashkernel=' to specify size of the memory to reserve + for boot memory dump preservation. + 2. If firmware-assisted dump fails to reserve memory then it + will fallback to existing kdump mechanism if 'crashkernel=' + option is set at kernel cmdline. + 3. if user wants to capture all of user space memory and ok with + reserved memory not available to production system, then + 'fadump=nocma' kernel parameter can be used to fallback to + old behaviour. + +Sysfs/debugfs files: +-------------------- + +Firmware-assisted dump feature uses sysfs file system to hold +the control files and debugfs file to display memory reserved region. + +Here is the list of files under kernel sysfs: + + /sys/kernel/fadump_enabled + This is used to display the fadump status. + + - 0 = fadump is disabled + - 1 = fadump is enabled + + This interface can be used by kdump init scripts to identify if + fadump is enabled in the kernel and act accordingly. + + /sys/kernel/fadump_registered + This is used to display the fadump registration status as well + as to control (start/stop) the fadump registration. + + - 0 = fadump is not registered. + - 1 = fadump is registered and ready to handle system crash. + + To register fadump echo 1 > /sys/kernel/fadump_registered and + echo 0 > /sys/kernel/fadump_registered for un-register and stop the + fadump. Once the fadump is un-registered, the system crash will not + be handled and vmcore will not be captured. This interface can be + easily integrated with kdump service start/stop. + + /sys/kernel/fadump_release_mem + This file is available only when fadump is active during + second kernel. This is used to release the reserved memory + region that are held for saving crash dump. To release the + reserved memory echo 1 to it:: + + echo 1 > /sys/kernel/fadump_release_mem + + After echo 1, the content of the /sys/kernel/debug/powerpc/fadump_region + file will change to reflect the new memory reservations. + + The existing userspace tools (kdump infrastructure) can be easily + enhanced to use this interface to release the memory reserved for + dump and continue without 2nd reboot. + +Here is the list of files under powerpc debugfs: +(Assuming debugfs is mounted on /sys/kernel/debug directory.) + + /sys/kernel/debug/powerpc/fadump_region + This file shows the reserved memory regions if fadump is + enabled otherwise this file is empty. The output format + is:: + + : [-] bytes, Dumped: + + e.g. + Contents when fadump is registered during first kernel:: + + # cat /sys/kernel/debug/powerpc/fadump_region + CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0 + HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0 + DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0 + + Contents when fadump is active during second kernel:: + + # cat /sys/kernel/debug/powerpc/fadump_region + CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020 + HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x1000 + DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000 + : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000 + +NOTE: + Please refer to Documentation/filesystems/debugfs.txt on + how to mount the debugfs filesystem. + + +TODO: +----- + - Need to come up with the better approach to find out more + accurate boot memory size that is required for a kernel to + boot successfully when booted with restricted memory. + - The fadump implementation introduces a fadump crash info structure + in the scratch area before the ELF core header. The idea of introducing + this structure is to pass some important crash info data to the second + kernel which will help second kernel to populate ELF core header with + correct data before it gets exported through /proc/vmcore. The current + design implementation does not address a possibility of introducing + additional fields (in future) to this structure without affecting + compatibility. Need to come up with the better approach to address this. + + The possible approaches are: + + 1. Introduce version field for version tracking, bump up the version + whenever a new field is added to the structure in future. The version + field can be used to find out what fields are valid for the current + version of the structure. + 2. Reserve the area of predefined size (say PAGE_SIZE) for this + structure and have unused area as reserved (initialized to zero) + for future field additions. + + The advantage of approach 1 over 2 is we don't need to reserve extra space. + +Author: Mahesh Salgaonkar + +This document is based on the original documentation written for phyp + +assisted dump by Linas Vepstas and Manish Ahuja. diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt deleted file mode 100644 index 10e7f4d16c14..000000000000 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ /dev/null @@ -1,292 +0,0 @@ - - Firmware-Assisted Dump - ------------------------ - July 2011 - -The goal of firmware-assisted dump is to enable the dump of -a crashed system, and to do so from a fully-reset system, and -to minimize the total elapsed time until the system is back -in production use. - -- Firmware assisted dump (fadump) infrastructure is intended to replace - the existing phyp assisted dump. -- Fadump uses the same firmware interfaces and memory reservation model - as phyp assisted dump. -- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore - in the ELF format in the same way as kdump. This helps us reuse the - kdump infrastructure for dump capture and filtering. -- Unlike phyp dump, userspace tool does not need to refer any sysfs - interface while reading /proc/vmcore. -- Unlike phyp dump, fadump allows user to release all the memory reserved - for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem. -- Once enabled through kernel boot parameter, fadump can be - started/stopped through /sys/kernel/fadump_registered interface (see - sysfs files section below) and can be easily integrated with kdump - service start/stop init scripts. - -Comparing with kdump or other strategies, firmware-assisted -dump offers several strong, practical advantages: - --- Unlike kdump, the system has been reset, and loaded - with a fresh copy of the kernel. In particular, - PCI and I/O devices have been reinitialized and are - in a clean, consistent state. --- Once the dump is copied out, the memory that held the dump - is immediately available to the running kernel. And therefore, - unlike kdump, fadump doesn't need a 2nd reboot to get back - the system to the production configuration. - -The above can only be accomplished by coordination with, -and assistance from the Power firmware. The procedure is -as follows: - --- The first kernel registers the sections of memory with the - Power firmware for dump preservation during OS initialization. - These registered sections of memory are reserved by the first - kernel during early boot. - --- When a system crashes, the Power firmware will save - the low memory (boot memory of size larger of 5% of system RAM - or 256MB) of RAM to the previous registered region. It will - also save system registers, and hardware PTE's. - - NOTE: The term 'boot memory' means size of the low memory chunk - that is required for a kernel to boot successfully when - booted with restricted memory. By default, the boot memory - size will be the larger of 5% of system RAM or 256MB. - Alternatively, user can also specify boot memory size - through boot parameter 'crashkernel=' which will override - the default calculated size. Use this option if default - boot memory size is not sufficient for second kernel to - boot successfully. For syntax of crashkernel= parameter, - refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is - provided in crashkernel= parameter, it will be ignored - as fadump uses a predefined offset to reserve memory - for boot memory dump preservation in case of a crash. - --- After the low memory (boot memory) area has been saved, the - firmware will reset PCI and other hardware state. It will - *not* clear the RAM. It will then launch the bootloader, as - normal. - --- The freshly booted kernel will notice that there is a new - node (ibm,dump-kernel) in the device tree, indicating that - there is crash data available from a previous boot. During - the early boot OS will reserve rest of the memory above - boot memory size effectively booting with restricted memory - size. This will make sure that the second kernel will not - touch any of the dump memory area. - --- User-space tools will read /proc/vmcore to obtain the contents - of memory, which holds the previous crashed kernel dump in ELF - format. The userspace tools may copy this info to disk, or - network, nas, san, iscsi, etc. as desired. - --- Once the userspace tool is done saving dump, it will echo - '1' to /sys/kernel/fadump_release_mem to release the reserved - memory back to general use, except the memory required for - next firmware-assisted dump registration. - - e.g. - # echo 1 > /sys/kernel/fadump_release_mem - -Please note that the firmware-assisted dump feature -is only available on Power6 and above systems with recent -firmware versions. - -Implementation details: ----------------------- - -During boot, a check is made to see if firmware supports -this feature on that particular machine. If it does, then -we check to see if an active dump is waiting for us. If yes -then everything but boot memory size of RAM is reserved during -early boot (See Fig. 2). This area is released once we finish -collecting the dump from user land scripts (e.g. kdump scripts) -that are run. If there is dump data, then the -/sys/kernel/fadump_release_mem file is created, and the reserved -memory is held. - -If there is no waiting dump data, then only the memory required -to hold CPU state, HPTE region, boot memory dump and elfcore -header, is usually reserved at an offset greater than boot memory -size (see Fig. 1). This area is *not* released: this region will -be kept permanently reserved, so that it can act as a receptacle -for a copy of the boot memory content in addition to CPU state -and HPTE region, in the case a crash does occur. Since this reserved -memory area is used only after the system crash, there is no point in -blocking this significant chunk of memory from production kernel. -Hence, the implementation uses the Linux kernel's Contiguous Memory -Allocator (CMA) for memory reservation if CMA is configured for kernel. -With CMA reservation this memory will be available for applications to -use it, while kernel is prevented from using it. With this fadump will -still be able to capture all of the kernel memory and most of the user -space memory except the user pages that were present in CMA region. - - o Memory Reservation during first kernel - - Low memory Top of memory - 0 boot memory size | - | | |<--Reserved dump area -->| | - V V | Permanent Reservation | V - +-----------+----------/ /---+---+----+-----------+----+------+ - | | |CPU|HPTE| DUMP |ELF | | - +-----------+----------/ /---+---+----+-----------+----+------+ - | ^ - | | - \ / - ------------------------------------------- - Boot memory content gets transferred to - reserved area by firmware at the time of - crash - Fig. 1 - - o Memory Reservation during second kernel after crash - - Low memory Top of memory - 0 boot memory size | - | |<------------- Reserved dump area ----------- -->| - V V V - +-----------+----------/ /---+---+----+-----------+----+------+ - | | |CPU|HPTE| DUMP |ELF | | - +-----------+----------/ /---+---+----+-----------+----+------+ - | | - V V - Used by second /proc/vmcore - kernel to boot - Fig. 2 - -Currently the dump will be copied from /proc/vmcore to a -a new file upon user intervention. The dump data available through -/proc/vmcore will be in ELF format. Hence the existing kdump -infrastructure (kdump scripts) to save the dump works fine with -minor modifications. - -The tools to examine the dump will be same as the ones -used for kdump. - -How to enable firmware-assisted dump (fadump): -------------------------------------- - -1. Set config option CONFIG_FA_DUMP=y and build kernel. -2. Boot into linux kernel with 'fadump=on' kernel cmdline option. - By default, fadump reserved memory will be initialized as CMA area. - Alternatively, user can boot linux kernel with 'fadump=nocma' to - prevent fadump to use CMA. -3. Optionally, user can also set 'crashkernel=' kernel cmdline - to specify size of the memory to reserve for boot memory dump - preservation. - -NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead - use 'crashkernel=' to specify size of the memory to reserve - for boot memory dump preservation. - 2. If firmware-assisted dump fails to reserve memory then it - will fallback to existing kdump mechanism if 'crashkernel=' - option is set at kernel cmdline. - 3. if user wants to capture all of user space memory and ok with - reserved memory not available to production system, then - 'fadump=nocma' kernel parameter can be used to fallback to - old behaviour. - -Sysfs/debugfs files: ------------- - -Firmware-assisted dump feature uses sysfs file system to hold -the control files and debugfs file to display memory reserved region. - -Here is the list of files under kernel sysfs: - - /sys/kernel/fadump_enabled - - This is used to display the fadump status. - 0 = fadump is disabled - 1 = fadump is enabled - - This interface can be used by kdump init scripts to identify if - fadump is enabled in the kernel and act accordingly. - - /sys/kernel/fadump_registered - - This is used to display the fadump registration status as well - as to control (start/stop) the fadump registration. - 0 = fadump is not registered. - 1 = fadump is registered and ready to handle system crash. - - To register fadump echo 1 > /sys/kernel/fadump_registered and - echo 0 > /sys/kernel/fadump_registered for un-register and stop the - fadump. Once the fadump is un-registered, the system crash will not - be handled and vmcore will not be captured. This interface can be - easily integrated with kdump service start/stop. - - /sys/kernel/fadump_release_mem - - This file is available only when fadump is active during - second kernel. This is used to release the reserved memory - region that are held for saving crash dump. To release the - reserved memory echo 1 to it: - - echo 1 > /sys/kernel/fadump_release_mem - - After echo 1, the content of the /sys/kernel/debug/powerpc/fadump_region - file will change to reflect the new memory reservations. - - The existing userspace tools (kdump infrastructure) can be easily - enhanced to use this interface to release the memory reserved for - dump and continue without 2nd reboot. - -Here is the list of files under powerpc debugfs: -(Assuming debugfs is mounted on /sys/kernel/debug directory.) - - /sys/kernel/debug/powerpc/fadump_region - - This file shows the reserved memory regions if fadump is - enabled otherwise this file is empty. The output format - is: - : [-] bytes, Dumped: - - e.g. - Contents when fadump is registered during first kernel - - # cat /sys/kernel/debug/powerpc/fadump_region - CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0 - HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0 - DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0 - - Contents when fadump is active during second kernel - - # cat /sys/kernel/debug/powerpc/fadump_region - CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020 - HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x1000 - DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000 - : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000 - -NOTE: Please refer to Documentation/filesystems/debugfs.txt on - how to mount the debugfs filesystem. - - -TODO: ------ - o Need to come up with the better approach to find out more - accurate boot memory size that is required for a kernel to - boot successfully when booted with restricted memory. - o The fadump implementation introduces a fadump crash info structure - in the scratch area before the ELF core header. The idea of introducing - this structure is to pass some important crash info data to the second - kernel which will help second kernel to populate ELF core header with - correct data before it gets exported through /proc/vmcore. The current - design implementation does not address a possibility of introducing - additional fields (in future) to this structure without affecting - compatibility. Need to come up with the better approach to address this. - The possible approaches are: - 1. Introduce version field for version tracking, bump up the version - whenever a new field is added to the structure in future. The version - field can be used to find out what fields are valid for the current - version of the structure. - 2. Reserve the area of predefined size (say PAGE_SIZE) for this - structure and have unused area as reserved (initialized to zero) - for future field additions. - The advantage of approach 1 over 2 is we don't need to reserve extra space. ---- -Author: Mahesh Salgaonkar -This document is based on the original documentation written for phyp -assisted dump by Linas Vepstas and Manish Ahuja. diff --git a/Documentation/powerpc/hvcs.rst b/Documentation/powerpc/hvcs.rst new file mode 100644 index 000000000000..6808acde672f --- /dev/null +++ b/Documentation/powerpc/hvcs.rst @@ -0,0 +1,581 @@ +=============================================================== +HVCS IBM "Hypervisor Virtual Console Server" Installation Guide +=============================================================== + +for Linux Kernel 2.6.4+ + +Copyright (C) 2004 IBM Corporation + +.. =========================================================================== +.. NOTE:Eight space tabs are the optimum editor setting for reading this file. +.. =========================================================================== + + +Author(s): Ryan S. Arnold + +Date Created: March, 02, 2004 +Last Changed: August, 24, 2004 + +.. Table of contents: + + 1. Driver Introduction: + 2. System Requirements + 3. Build Options: + 3.1 Built-in: + 3.2 Module: + 4. Installation: + 5. Connection: + 6. Disconnection: + 7. Configuration: + 8. Questions & Answers: + 9. Reporting Bugs: + +1. Driver Introduction: +======================= + +This is the device driver for the IBM Hypervisor Virtual Console Server, +"hvcs". The IBM hvcs provides a tty driver interface to allow Linux user +space applications access to the system consoles of logically partitioned +operating systems (Linux and AIX) running on the same partitioned Power5 +ppc64 system. Physical hardware consoles per partition are not practical +on this hardware so system consoles are accessed by this driver using +firmware interfaces to virtual terminal devices. + +2. System Requirements: +======================= + +This device driver was written using 2.6.4 Linux kernel APIs and will only +build and run on kernels of this version or later. + +This driver was written to operate solely on IBM Power5 ppc64 hardware +though some care was taken to abstract the architecture dependent firmware +calls from the driver code. + +Sysfs must be mounted on the system so that the user can determine which +major and minor numbers are associated with each vty-server. Directions +for sysfs mounting are outside the scope of this document. + +3. Build Options: +================= + +The hvcs driver registers itself as a tty driver. The tty layer +dynamically allocates a block of major and minor numbers in a quantity +requested by the registering driver. The hvcs driver asks the tty layer +for 64 of these major/minor numbers by default to use for hvcs device node +entries. + +If the default number of device entries is adequate then this driver can be +built into the kernel. If not, the default can be over-ridden by inserting +the driver as a module with insmod parameters. + +3.1 Built-in: +------------- + +The following menuconfig example demonstrates selecting to build this +driver into the kernel:: + + Device Drivers ---> + Character devices ---> + <*> IBM Hypervisor Virtual Console Server Support + +Begin the kernel make process. + +3.2 Module: +----------- + +The following menuconfig example demonstrates selecting to build this +driver as a kernel module:: + + Device Drivers ---> + Character devices ---> + IBM Hypervisor Virtual Console Server Support + +The make process will build the following kernel modules: + + - hvcs.ko + - hvcserver.ko + +To insert the module with the default allocation execute the following +commands in the order they appear:: + + insmod hvcserver.ko + insmod hvcs.ko + +The hvcserver module contains architecture specific firmware calls and must +be inserted first, otherwise the hvcs module will not find some of the +symbols it expects. + +To override the default use an insmod parameter as follows (requesting 4 +tty devices as an example):: + + insmod hvcs.ko hvcs_parm_num_devs=4 + +There is a maximum number of dev entries that can be specified on insmod. +We think that 1024 is currently a decent maximum number of server adapters +to allow. This can always be changed by modifying the constant in the +source file before building. + +NOTE: The length of time it takes to insmod the driver seems to be related +to the number of tty interfaces the registering driver requests. + +In order to remove the driver module execute the following command:: + + rmmod hvcs.ko + +The recommended method for installing hvcs as a module is to use depmod to +build a current modules.dep file in /lib/modules/`uname -r` and then +execute:: + + modprobe hvcs hvcs_parm_num_devs=4 + +The modules.dep file indicates that hvcserver.ko needs to be inserted +before hvcs.ko and modprobe uses this file to smartly insert the modules in +the proper order. + +The following modprobe command is used to remove hvcs and hvcserver in the +proper order:: + + modprobe -r hvcs + +4. Installation: +================ + +The tty layer creates sysfs entries which contain the major and minor +numbers allocated for the hvcs driver. The following snippet of "tree" +output of the sysfs directory shows where these numbers are presented:: + + sys/ + |-- *other sysfs base dirs* + | + |-- class + | |-- *other classes of devices* + | | + | `-- tty + | |-- *other tty devices* + | | + | |-- hvcs0 + | | `-- dev + | |-- hvcs1 + | | `-- dev + | |-- hvcs2 + | | `-- dev + | |-- hvcs3 + | | `-- dev + | | + | |-- *other tty devices* + | + |-- *other sysfs base dirs* + +For the above examples the following output is a result of cat'ing the +"dev" entry in the hvcs directory:: + + Pow5:/sys/class/tty/hvcs0/ # cat dev + 254:0 + + Pow5:/sys/class/tty/hvcs1/ # cat dev + 254:1 + + Pow5:/sys/class/tty/hvcs2/ # cat dev + 254:2 + + Pow5:/sys/class/tty/hvcs3/ # cat dev + 254:3 + +The output from reading the "dev" attribute is the char device major and +minor numbers that the tty layer has allocated for this driver's use. Most +systems running hvcs will already have the device entries created or udev +will do it automatically. + +Given the example output above, to manually create a /dev/hvcs* node entry +mknod can be used as follows:: + + mknod /dev/hvcs0 c 254 0 + mknod /dev/hvcs1 c 254 1 + mknod /dev/hvcs2 c 254 2 + mknod /dev/hvcs3 c 254 3 + +Using mknod to manually create the device entries makes these device nodes +persistent. Once created they will exist prior to the driver insmod. + +Attempting to connect an application to /dev/hvcs* prior to insertion of +the hvcs module will result in an error message similar to the following:: + + "/dev/hvcs*: No such device". + +NOTE: Just because there is a device node present doesn't mean that there +is a vty-server device configured for that node. + +5. Connection +============= + +Since this driver controls devices that provide a tty interface a user can +interact with the device node entries using any standard tty-interactive +method (e.g. "cat", "dd", "echo"). The intent of this driver however, is +to provide real time console interaction with a Linux partition's console, +which requires the use of applications that provide bi-directional, +interactive I/O with a tty device. + +Applications (e.g. "minicom" and "screen") that act as terminal emulators +or perform terminal type control sequence conversion on the data being +passed through them are NOT acceptable for providing interactive console +I/O. These programs often emulate antiquated terminal types (vt100 and +ANSI) and expect inbound data to take the form of one of these supported +terminal types but they either do not convert, or do not _adequately_ +convert, outbound data into the terminal type of the terminal which invoked +them (though screen makes an attempt and can apparently be configured with +much termcap wrestling.) + +For this reason kermit and cu are two of the recommended applications for +interacting with a Linux console via an hvcs device. These programs simply +act as a conduit for data transfer to and from the tty device. They do not +require inbound data to take the form of a particular terminal type, nor do +they cook outbound data to a particular terminal type. + +In order to ensure proper functioning of console applications one must make +sure that once connected to a /dev/hvcs console that the console's $TERM +env variable is set to the exact terminal type of the terminal emulator +used to launch the interactive I/O application. If one is using xterm and +kermit to connect to /dev/hvcs0 when the console prompt becomes available +one should "export TERM=xterm" on the console. This tells ncurses +applications that are invoked from the console that they should output +control sequences that xterm can understand. + +As a precautionary measure an hvcs user should always "exit" from their +session before disconnecting an application such as kermit from the device +node. If this is not done, the next user to connect to the console will +continue using the previous user's logged in session which includes +using the $TERM variable that the previous user supplied. + +Hotplug add and remove of vty-server adapters affects which /dev/hvcs* node +is used to connect to each vty-server adapter. In order to determine which +vty-server adapter is associated with which /dev/hvcs* node a special sysfs +attribute has been added to each vty-server sysfs entry. This entry is +called "index" and showing it reveals an integer that refers to the +/dev/hvcs* entry to use to connect to that device. For instance cating the +index attribute of vty-server adapter 30000004 shows the following:: + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat index + 2 + +This index of '2' means that in order to connect to vty-server adapter +30000004 the user should interact with /dev/hvcs2. + +It should be noted that due to the system hotplug I/O capabilities of a +system the /dev/hvcs* entry that interacts with a particular vty-server +adapter is not guaranteed to remain the same across system reboots. Look +in the Q & A section for more on this issue. + +6. Disconnection +================ + +As a security feature to prevent the delivery of stale data to an +unintended target the Power5 system firmware disables the fetching of data +and discards that data when a connection between a vty-server and a vty has +been severed. As an example, when a vty-server is immediately disconnected +from a vty following output of data to the vty the vty adapter may not have +enough time between when it received the data interrupt and when the +connection was severed to fetch the data from firmware before the fetch is +disabled by firmware. + +When hvcs is being used to serve consoles this behavior is not a huge issue +because the adapter stays connected for large amounts of time following +almost all data writes. When hvcs is being used as a tty conduit to tunnel +data between two partitions [see Q & A below] this is a huge problem +because the standard Linux behavior when cat'ing or dd'ing data to a device +is to open the tty, send the data, and then close the tty. If this driver +manually terminated vty-server connections on tty close this would close +the vty-server and vty connection before the target vty has had a chance to +fetch the data. + +Additionally, disconnecting a vty-server and vty only on module removal or +adapter removal is impractical because other vty-servers in other +partitions may require the usage of the target vty at any time. + +Due to this behavioral restriction disconnection of vty-servers from the +connected vty is a manual procedure using a write to a sysfs attribute +outlined below, on the other hand the initial vty-server connection to a +vty is established automatically by this driver. Manual vty-server +connection is never required. + +In order to terminate the connection between a vty-server and vty the +"vterm_state" sysfs attribute within each vty-server's sysfs entry is used. +Reading this attribute reveals the current connection state of the +vty-server adapter. A zero means that the vty-server is not connected to a +vty. A one indicates that a connection is active. + +Writing a '0' (zero) to the vterm_state attribute will disconnect the VTERM +connection between the vty-server and target vty ONLY if the vterm_state +previously read '1'. The write directive is ignored if the vterm_state +read '0' or if any value other than '0' was written to the vterm_state +attribute. The following example will show the method used for verifying +the vty-server connection status and disconnecting a vty-server connection:: + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state + 1 + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo 0 > vterm_state + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state + 0 + +All vty-server connections are automatically terminated when the device is +hotplug removed and when the module is removed. + +7. Configuration +================ + +Each vty-server has a sysfs entry in the /sys/devices/vio directory, which +is symlinked in several other sysfs tree directories, notably under the +hvcs driver entry, which looks like the following example:: + + Pow5:/sys/bus/vio/drivers/hvcs # ls + . .. 30000003 30000004 rescan + +By design, firmware notifies the hvcs driver of vty-server lifetimes and +partner vty removals but not the addition of partner vtys. Since an HMC +Super Admin can add partner info dynamically we have provided the hvcs +driver sysfs directory with the "rescan" update attribute which will query +firmware and update the partner info for all the vty-servers that this +driver manages. Writing a '1' to the attribute triggers the update. An +explicit example follows: + + Pow5:/sys/bus/vio/drivers/hvcs # echo 1 > rescan + +Reading the attribute will indicate a state of '1' or '0'. A one indicates +that an update is in process. A zero indicates that an update has +completed or was never executed. + +Vty-server entries in this directory are a 32 bit partition unique unit +address that is created by firmware. An example vty-server sysfs entry +looks like the following:: + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # ls + . current_vty devspec name partner_vtys + .. index partner_clcs vterm_state + +Each entry is provided, by default with a "name" attribute. Reading the +"name" attribute will reveal the device type as shown in the following +example:: + + Pow5:/sys/bus/vio/drivers/hvcs/30000003 # cat name + vty-server + +Each entry is also provided, by default, with a "devspec" attribute which +reveals the full device specification when read, as shown in the following +example:: + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat devspec + /vdevice/vty-server@30000004 + +Each vty-server sysfs dir is provided with two read-only attributes that +provide lists of easily parsed partner vty data: "partner_vtys" and +"partner_clcs":: + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_vtys + 30000000 + 30000001 + 30000002 + 30000000 + 30000000 + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_clcs + U5112.428.103048A-V3-C0 + U5112.428.103048A-V3-C2 + U5112.428.103048A-V3-C3 + U5112.428.103048A-V4-C0 + U5112.428.103048A-V5-C0 + +Reading partner_vtys returns a list of partner vtys. Vty unit address +numbering is only per-partition-unique so entries will frequently repeat. + +Reading partner_clcs returns a list of "converged location codes" which are +composed of a system serial number followed by "-V*", where the '*' is the +target partition number, and "-C*", where the '*' is the slot of the +adapter. The first vty partner corresponds to the first clc item, the +second vty partner to the second clc item, etc. + +A vty-server can only be connected to a single vty at a time. The entry, +"current_vty" prints the clc of the currently selected partner vty when +read. + +The current_vty can be changed by writing a valid partner clc to the entry +as in the following example:: + + Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo U5112.428.10304 + 8A-V4-C0 > current_vty + +Changing the current_vty when a vty-server is already connected to a vty +does not affect the current connection. The change takes effect when the +currently open connection is freed. + +Information on the "vterm_state" attribute was covered earlier on the +chapter entitled "disconnection". + +8. Questions & Answers: +======================= + +Q: What are the security concerns involving hvcs? + +A: There are three main security concerns: + + 1. The creator of the /dev/hvcs* nodes has the ability to restrict + the access of the device entries to certain users or groups. It + may be best to create a special hvcs group privilege for providing + access to system consoles. + + 2. To provide network security when grabbing the console it is + suggested that the user connect to the console hosting partition + using a secure method, such as SSH or sit at a hardware console. + + 3. Make sure to exit the user session when done with a console or + the next vty-server connection (which may be from another + partition) will experience the previously logged in session. + +--------------------------------------------------------------------------- + +Q: How do I multiplex a console that I grab through hvcs so that other +people can see it: + +A: You can use "screen" to directly connect to the /dev/hvcs* device and +setup a session on your machine with the console group privileges. As +pointed out earlier by default screen doesn't provide the termcap settings +for most terminal emulators to provide adequate character conversion from +term type "screen" to others. This means that curses based programs may +not display properly in screen sessions. + +--------------------------------------------------------------------------- + +Q: Why are the colors all messed up? +Q: Why are the control characters acting strange or not working? +Q: Why is the console output all strange and unintelligible? + +A: Please see the preceding section on "Connection" for a discussion of how +applications can affect the display of character control sequences. +Additionally, just because you logged into the console using and xterm +doesn't mean someone else didn't log into the console with the HMC console +(vt320) before you and leave the session logged in. The best thing to do +is to export TERM to the terminal type of your terminal emulator when you +get the console. Additionally make sure to "exit" the console before you +disconnect from the console. This will ensure that the next user gets +their own TERM type set when they login. + +--------------------------------------------------------------------------- + +Q: When I try to CONNECT kermit to an hvcs device I get: +"Sorry, can't open connection: /dev/hvcs*"What is happening? + +A: Some other Power5 console mechanism has a connection to the vty and +isn't giving it up. You can try to force disconnect the consoles from the +HMC by right clicking on the partition and then selecting "close terminal". +Otherwise you have to hunt down the people who have console authority. It +is possible that you already have the console open using another kermit +session and just forgot about it. Please review the console options for +Power5 systems to determine the many ways a system console can be held. + +OR + +A: Another user may not have a connectivity method currently attached to a +/dev/hvcs device but the vterm_state may reveal that they still have the +vty-server connection established. They need to free this using the method +outlined in the section on "Disconnection" in order for others to connect +to the target vty. + +OR + +A: The user profile you are using to execute kermit probably doesn't have +permissions to use the /dev/hvcs* device. + +OR + +A: You probably haven't inserted the hvcs.ko module yet but the /dev/hvcs* +entry still exists (on systems without udev). + +OR + +A: There is not a corresponding vty-server device that maps to an existing +/dev/hvcs* entry. + +--------------------------------------------------------------------------- + +Q: When I try to CONNECT kermit to an hvcs device I get: +"Sorry, write access to UUCP lockfile directory denied." + +A: The /dev/hvcs* entry you have specified doesn't exist where you said it +does? Maybe you haven't inserted the module (on systems with udev). + +--------------------------------------------------------------------------- + +Q: If I already have one Linux partition installed can I use hvcs on said +partition to provide the console for the install of a second Linux +partition? + +A: Yes granted that your are connected to the /dev/hvcs* device using +kermit or cu or some other program that doesn't provide terminal emulation. + +--------------------------------------------------------------------------- + +Q: Can I connect to more than one partition's console at a time using this +driver? + +A: Yes. Of course this means that there must be more than one vty-server +configured for this partition and each must point to a disconnected vty. + +--------------------------------------------------------------------------- + +Q: Does the hvcs driver support dynamic (hotplug) addition of devices? + +A: Yes, if you have dlpar and hotplug enabled for your system and it has +been built into the kernel the hvcs drivers is configured to dynamically +handle additions of new devices and removals of unused devices. + +--------------------------------------------------------------------------- + +Q: For some reason /dev/hvcs* doesn't map to the same vty-server adapter +after a reboot. What happened? + +A: Assignment of vty-server adapters to /dev/hvcs* entries is always done +in the order that the adapters are exposed. Due to hotplug capabilities of +this driver assignment of hotplug added vty-servers may be in a different +order than how they would be exposed on module load. Rebooting or +reloading the module after dynamic addition may result in the /dev/hvcs* +and vty-server coupling changing if a vty-server adapter was added in a +slot between two other vty-server adapters. Refer to the section above +on how to determine which vty-server goes with which /dev/hvcs* node. +Hint; look at the sysfs "index" attribute for the vty-server. + +--------------------------------------------------------------------------- + +Q: Can I use /dev/hvcs* as a conduit to another partition and use a tty +device on that partition as the other end of the pipe? + +A: Yes, on Power5 platforms the hvc_console driver provides a tty interface +for extra /dev/hvc* devices (where /dev/hvc0 is most likely the console). +In order to get a tty conduit working between the two partitions the HMC +Super Admin must create an additional "serial server" for the target +partition with the HMC gui which will show up as /dev/hvc* when the target +partition is rebooted. + +The HMC Super Admin then creates an additional "serial client" for the +current partition and points this at the target partition's newly created +"serial server" adapter (remember the slot). This shows up as an +additional /dev/hvcs* device. + +Now a program on the target system can be configured to read or write to +/dev/hvc* and another program on the current partition can be configured to +read or write to /dev/hvcs*. Now you have a tty conduit between two +partitions. + +--------------------------------------------------------------------------- + +9. Reporting Bugs: +================== + +The proper channel for reporting bugs is either through the Linux OS +distribution company that provided your OS or by posting issues to the +PowerPC development mailing list at: + +linuxppc-dev@lists.ozlabs.org + +This request is to provide a documented and searchable public exchange +of the problems and solutions surrounding this driver for the benefit of +all users. diff --git a/Documentation/powerpc/hvcs.txt b/Documentation/powerpc/hvcs.txt deleted file mode 100644 index a730ca5a07f8..000000000000 --- a/Documentation/powerpc/hvcs.txt +++ /dev/null @@ -1,567 +0,0 @@ -=========================================================================== - HVCS - IBM "Hypervisor Virtual Console Server" Installation Guide - for Linux Kernel 2.6.4+ - Copyright (C) 2004 IBM Corporation - -=========================================================================== -NOTE:Eight space tabs are the optimum editor setting for reading this file. -=========================================================================== - - Author(s) : Ryan S. Arnold - Date Created: March, 02, 2004 - Last Changed: August, 24, 2004 - ---------------------------------------------------------------------------- -Table of contents: - - 1. Driver Introduction: - 2. System Requirements - 3. Build Options: - 3.1 Built-in: - 3.2 Module: - 4. Installation: - 5. Connection: - 6. Disconnection: - 7. Configuration: - 8. Questions & Answers: - 9. Reporting Bugs: - ---------------------------------------------------------------------------- -1. Driver Introduction: - -This is the device driver for the IBM Hypervisor Virtual Console Server, -"hvcs". The IBM hvcs provides a tty driver interface to allow Linux user -space applications access to the system consoles of logically partitioned -operating systems (Linux and AIX) running on the same partitioned Power5 -ppc64 system. Physical hardware consoles per partition are not practical -on this hardware so system consoles are accessed by this driver using -firmware interfaces to virtual terminal devices. - ---------------------------------------------------------------------------- -2. System Requirements: - -This device driver was written using 2.6.4 Linux kernel APIs and will only -build and run on kernels of this version or later. - -This driver was written to operate solely on IBM Power5 ppc64 hardware -though some care was taken to abstract the architecture dependent firmware -calls from the driver code. - -Sysfs must be mounted on the system so that the user can determine which -major and minor numbers are associated with each vty-server. Directions -for sysfs mounting are outside the scope of this document. - ---------------------------------------------------------------------------- -3. Build Options: - -The hvcs driver registers itself as a tty driver. The tty layer -dynamically allocates a block of major and minor numbers in a quantity -requested by the registering driver. The hvcs driver asks the tty layer -for 64 of these major/minor numbers by default to use for hvcs device node -entries. - -If the default number of device entries is adequate then this driver can be -built into the kernel. If not, the default can be over-ridden by inserting -the driver as a module with insmod parameters. - ---------------------------------------------------------------------------- -3.1 Built-in: - -The following menuconfig example demonstrates selecting to build this -driver into the kernel. - - Device Drivers ---> - Character devices ---> - <*> IBM Hypervisor Virtual Console Server Support - -Begin the kernel make process. - ---------------------------------------------------------------------------- -3.2 Module: - -The following menuconfig example demonstrates selecting to build this -driver as a kernel module. - - Device Drivers ---> - Character devices ---> - IBM Hypervisor Virtual Console Server Support - -The make process will build the following kernel modules: - - hvcs.ko - hvcserver.ko - -To insert the module with the default allocation execute the following -commands in the order they appear: - - insmod hvcserver.ko - insmod hvcs.ko - -The hvcserver module contains architecture specific firmware calls and must -be inserted first, otherwise the hvcs module will not find some of the -symbols it expects. - -To override the default use an insmod parameter as follows (requesting 4 -tty devices as an example): - - insmod hvcs.ko hvcs_parm_num_devs=4 - -There is a maximum number of dev entries that can be specified on insmod. -We think that 1024 is currently a decent maximum number of server adapters -to allow. This can always be changed by modifying the constant in the -source file before building. - -NOTE: The length of time it takes to insmod the driver seems to be related -to the number of tty interfaces the registering driver requests. - -In order to remove the driver module execute the following command: - - rmmod hvcs.ko - -The recommended method for installing hvcs as a module is to use depmod to -build a current modules.dep file in /lib/modules/`uname -r` and then -execute: - -modprobe hvcs hvcs_parm_num_devs=4 - -The modules.dep file indicates that hvcserver.ko needs to be inserted -before hvcs.ko and modprobe uses this file to smartly insert the modules in -the proper order. - -The following modprobe command is used to remove hvcs and hvcserver in the -proper order: - -modprobe -r hvcs - ---------------------------------------------------------------------------- -4. Installation: - -The tty layer creates sysfs entries which contain the major and minor -numbers allocated for the hvcs driver. The following snippet of "tree" -output of the sysfs directory shows where these numbers are presented: - - sys/ - |-- *other sysfs base dirs* - | - |-- class - | |-- *other classes of devices* - | | - | `-- tty - | |-- *other tty devices* - | | - | |-- hvcs0 - | | `-- dev - | |-- hvcs1 - | | `-- dev - | |-- hvcs2 - | | `-- dev - | |-- hvcs3 - | | `-- dev - | | - | |-- *other tty devices* - | - |-- *other sysfs base dirs* - -For the above examples the following output is a result of cat'ing the -"dev" entry in the hvcs directory: - - Pow5:/sys/class/tty/hvcs0/ # cat dev - 254:0 - - Pow5:/sys/class/tty/hvcs1/ # cat dev - 254:1 - - Pow5:/sys/class/tty/hvcs2/ # cat dev - 254:2 - - Pow5:/sys/class/tty/hvcs3/ # cat dev - 254:3 - -The output from reading the "dev" attribute is the char device major and -minor numbers that the tty layer has allocated for this driver's use. Most -systems running hvcs will already have the device entries created or udev -will do it automatically. - -Given the example output above, to manually create a /dev/hvcs* node entry -mknod can be used as follows: - - mknod /dev/hvcs0 c 254 0 - mknod /dev/hvcs1 c 254 1 - mknod /dev/hvcs2 c 254 2 - mknod /dev/hvcs3 c 254 3 - -Using mknod to manually create the device entries makes these device nodes -persistent. Once created they will exist prior to the driver insmod. - -Attempting to connect an application to /dev/hvcs* prior to insertion of -the hvcs module will result in an error message similar to the following: - - "/dev/hvcs*: No such device". - -NOTE: Just because there is a device node present doesn't mean that there -is a vty-server device configured for that node. - ---------------------------------------------------------------------------- -5. Connection - -Since this driver controls devices that provide a tty interface a user can -interact with the device node entries using any standard tty-interactive -method (e.g. "cat", "dd", "echo"). The intent of this driver however, is -to provide real time console interaction with a Linux partition's console, -which requires the use of applications that provide bi-directional, -interactive I/O with a tty device. - -Applications (e.g. "minicom" and "screen") that act as terminal emulators -or perform terminal type control sequence conversion on the data being -passed through them are NOT acceptable for providing interactive console -I/O. These programs often emulate antiquated terminal types (vt100 and -ANSI) and expect inbound data to take the form of one of these supported -terminal types but they either do not convert, or do not _adequately_ -convert, outbound data into the terminal type of the terminal which invoked -them (though screen makes an attempt and can apparently be configured with -much termcap wrestling.) - -For this reason kermit and cu are two of the recommended applications for -interacting with a Linux console via an hvcs device. These programs simply -act as a conduit for data transfer to and from the tty device. They do not -require inbound data to take the form of a particular terminal type, nor do -they cook outbound data to a particular terminal type. - -In order to ensure proper functioning of console applications one must make -sure that once connected to a /dev/hvcs console that the console's $TERM -env variable is set to the exact terminal type of the terminal emulator -used to launch the interactive I/O application. If one is using xterm and -kermit to connect to /dev/hvcs0 when the console prompt becomes available -one should "export TERM=xterm" on the console. This tells ncurses -applications that are invoked from the console that they should output -control sequences that xterm can understand. - -As a precautionary measure an hvcs user should always "exit" from their -session before disconnecting an application such as kermit from the device -node. If this is not done, the next user to connect to the console will -continue using the previous user's logged in session which includes -using the $TERM variable that the previous user supplied. - -Hotplug add and remove of vty-server adapters affects which /dev/hvcs* node -is used to connect to each vty-server adapter. In order to determine which -vty-server adapter is associated with which /dev/hvcs* node a special sysfs -attribute has been added to each vty-server sysfs entry. This entry is -called "index" and showing it reveals an integer that refers to the -/dev/hvcs* entry to use to connect to that device. For instance cating the -index attribute of vty-server adapter 30000004 shows the following. - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat index - 2 - -This index of '2' means that in order to connect to vty-server adapter -30000004 the user should interact with /dev/hvcs2. - -It should be noted that due to the system hotplug I/O capabilities of a -system the /dev/hvcs* entry that interacts with a particular vty-server -adapter is not guaranteed to remain the same across system reboots. Look -in the Q & A section for more on this issue. - ---------------------------------------------------------------------------- -6. Disconnection - -As a security feature to prevent the delivery of stale data to an -unintended target the Power5 system firmware disables the fetching of data -and discards that data when a connection between a vty-server and a vty has -been severed. As an example, when a vty-server is immediately disconnected -from a vty following output of data to the vty the vty adapter may not have -enough time between when it received the data interrupt and when the -connection was severed to fetch the data from firmware before the fetch is -disabled by firmware. - -When hvcs is being used to serve consoles this behavior is not a huge issue -because the adapter stays connected for large amounts of time following -almost all data writes. When hvcs is being used as a tty conduit to tunnel -data between two partitions [see Q & A below] this is a huge problem -because the standard Linux behavior when cat'ing or dd'ing data to a device -is to open the tty, send the data, and then close the tty. If this driver -manually terminated vty-server connections on tty close this would close -the vty-server and vty connection before the target vty has had a chance to -fetch the data. - -Additionally, disconnecting a vty-server and vty only on module removal or -adapter removal is impractical because other vty-servers in other -partitions may require the usage of the target vty at any time. - -Due to this behavioral restriction disconnection of vty-servers from the -connected vty is a manual procedure using a write to a sysfs attribute -outlined below, on the other hand the initial vty-server connection to a -vty is established automatically by this driver. Manual vty-server -connection is never required. - -In order to terminate the connection between a vty-server and vty the -"vterm_state" sysfs attribute within each vty-server's sysfs entry is used. -Reading this attribute reveals the current connection state of the -vty-server adapter. A zero means that the vty-server is not connected to a -vty. A one indicates that a connection is active. - -Writing a '0' (zero) to the vterm_state attribute will disconnect the VTERM -connection between the vty-server and target vty ONLY if the vterm_state -previously read '1'. The write directive is ignored if the vterm_state -read '0' or if any value other than '0' was written to the vterm_state -attribute. The following example will show the method used for verifying -the vty-server connection status and disconnecting a vty-server connection. - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state - 1 - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo 0 > vterm_state - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state - 0 - -All vty-server connections are automatically terminated when the device is -hotplug removed and when the module is removed. - ---------------------------------------------------------------------------- -7. Configuration - -Each vty-server has a sysfs entry in the /sys/devices/vio directory, which -is symlinked in several other sysfs tree directories, notably under the -hvcs driver entry, which looks like the following example: - - Pow5:/sys/bus/vio/drivers/hvcs # ls - . .. 30000003 30000004 rescan - -By design, firmware notifies the hvcs driver of vty-server lifetimes and -partner vty removals but not the addition of partner vtys. Since an HMC -Super Admin can add partner info dynamically we have provided the hvcs -driver sysfs directory with the "rescan" update attribute which will query -firmware and update the partner info for all the vty-servers that this -driver manages. Writing a '1' to the attribute triggers the update. An -explicit example follows: - - Pow5:/sys/bus/vio/drivers/hvcs # echo 1 > rescan - -Reading the attribute will indicate a state of '1' or '0'. A one indicates -that an update is in process. A zero indicates that an update has -completed or was never executed. - -Vty-server entries in this directory are a 32 bit partition unique unit -address that is created by firmware. An example vty-server sysfs entry -looks like the following: - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # ls - . current_vty devspec name partner_vtys - .. index partner_clcs vterm_state - -Each entry is provided, by default with a "name" attribute. Reading the -"name" attribute will reveal the device type as shown in the following -example: - - Pow5:/sys/bus/vio/drivers/hvcs/30000003 # cat name - vty-server - -Each entry is also provided, by default, with a "devspec" attribute which -reveals the full device specification when read, as shown in the following -example: - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat devspec - /vdevice/vty-server@30000004 - -Each vty-server sysfs dir is provided with two read-only attributes that -provide lists of easily parsed partner vty data: "partner_vtys" and -"partner_clcs". - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_vtys - 30000000 - 30000001 - 30000002 - 30000000 - 30000000 - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_clcs - U5112.428.103048A-V3-C0 - U5112.428.103048A-V3-C2 - U5112.428.103048A-V3-C3 - U5112.428.103048A-V4-C0 - U5112.428.103048A-V5-C0 - -Reading partner_vtys returns a list of partner vtys. Vty unit address -numbering is only per-partition-unique so entries will frequently repeat. - -Reading partner_clcs returns a list of "converged location codes" which are -composed of a system serial number followed by "-V*", where the '*' is the -target partition number, and "-C*", where the '*' is the slot of the -adapter. The first vty partner corresponds to the first clc item, the -second vty partner to the second clc item, etc. - -A vty-server can only be connected to a single vty at a time. The entry, -"current_vty" prints the clc of the currently selected partner vty when -read. - -The current_vty can be changed by writing a valid partner clc to the entry -as in the following example: - - Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo U5112.428.10304 - 8A-V4-C0 > current_vty - -Changing the current_vty when a vty-server is already connected to a vty -does not affect the current connection. The change takes effect when the -currently open connection is freed. - -Information on the "vterm_state" attribute was covered earlier on the -chapter entitled "disconnection". - ---------------------------------------------------------------------------- -8. Questions & Answers: -=========================================================================== -Q: What are the security concerns involving hvcs? - -A: There are three main security concerns: - - 1. The creator of the /dev/hvcs* nodes has the ability to restrict - the access of the device entries to certain users or groups. It - may be best to create a special hvcs group privilege for providing - access to system consoles. - - 2. To provide network security when grabbing the console it is - suggested that the user connect to the console hosting partition - using a secure method, such as SSH or sit at a hardware console. - - 3. Make sure to exit the user session when done with a console or - the next vty-server connection (which may be from another - partition) will experience the previously logged in session. - ---------------------------------------------------------------------------- -Q: How do I multiplex a console that I grab through hvcs so that other -people can see it: - -A: You can use "screen" to directly connect to the /dev/hvcs* device and -setup a session on your machine with the console group privileges. As -pointed out earlier by default screen doesn't provide the termcap settings -for most terminal emulators to provide adequate character conversion from -term type "screen" to others. This means that curses based programs may -not display properly in screen sessions. - ---------------------------------------------------------------------------- -Q: Why are the colors all messed up? -Q: Why are the control characters acting strange or not working? -Q: Why is the console output all strange and unintelligible? - -A: Please see the preceding section on "Connection" for a discussion of how -applications can affect the display of character control sequences. -Additionally, just because you logged into the console using and xterm -doesn't mean someone else didn't log into the console with the HMC console -(vt320) before you and leave the session logged in. The best thing to do -is to export TERM to the terminal type of your terminal emulator when you -get the console. Additionally make sure to "exit" the console before you -disconnect from the console. This will ensure that the next user gets -their own TERM type set when they login. - ---------------------------------------------------------------------------- -Q: When I try to CONNECT kermit to an hvcs device I get: -"Sorry, can't open connection: /dev/hvcs*"What is happening? - -A: Some other Power5 console mechanism has a connection to the vty and -isn't giving it up. You can try to force disconnect the consoles from the -HMC by right clicking on the partition and then selecting "close terminal". -Otherwise you have to hunt down the people who have console authority. It -is possible that you already have the console open using another kermit -session and just forgot about it. Please review the console options for -Power5 systems to determine the many ways a system console can be held. - -OR - -A: Another user may not have a connectivity method currently attached to a -/dev/hvcs device but the vterm_state may reveal that they still have the -vty-server connection established. They need to free this using the method -outlined in the section on "Disconnection" in order for others to connect -to the target vty. - -OR - -A: The user profile you are using to execute kermit probably doesn't have -permissions to use the /dev/hvcs* device. - -OR - -A: You probably haven't inserted the hvcs.ko module yet but the /dev/hvcs* -entry still exists (on systems without udev). - -OR - -A: There is not a corresponding vty-server device that maps to an existing -/dev/hvcs* entry. - ---------------------------------------------------------------------------- -Q: When I try to CONNECT kermit to an hvcs device I get: -"Sorry, write access to UUCP lockfile directory denied." - -A: The /dev/hvcs* entry you have specified doesn't exist where you said it -does? Maybe you haven't inserted the module (on systems with udev). - ---------------------------------------------------------------------------- -Q: If I already have one Linux partition installed can I use hvcs on said -partition to provide the console for the install of a second Linux -partition? - -A: Yes granted that your are connected to the /dev/hvcs* device using -kermit or cu or some other program that doesn't provide terminal emulation. - ---------------------------------------------------------------------------- -Q: Can I connect to more than one partition's console at a time using this -driver? - -A: Yes. Of course this means that there must be more than one vty-server -configured for this partition and each must point to a disconnected vty. - ---------------------------------------------------------------------------- -Q: Does the hvcs driver support dynamic (hotplug) addition of devices? - -A: Yes, if you have dlpar and hotplug enabled for your system and it has -been built into the kernel the hvcs drivers is configured to dynamically -handle additions of new devices and removals of unused devices. - ---------------------------------------------------------------------------- -Q: For some reason /dev/hvcs* doesn't map to the same vty-server adapter -after a reboot. What happened? - -A: Assignment of vty-server adapters to /dev/hvcs* entries is always done -in the order that the adapters are exposed. Due to hotplug capabilities of -this driver assignment of hotplug added vty-servers may be in a different -order than how they would be exposed on module load. Rebooting or -reloading the module after dynamic addition may result in the /dev/hvcs* -and vty-server coupling changing if a vty-server adapter was added in a -slot between two other vty-server adapters. Refer to the section above -on how to determine which vty-server goes with which /dev/hvcs* node. -Hint; look at the sysfs "index" attribute for the vty-server. - ---------------------------------------------------------------------------- -Q: Can I use /dev/hvcs* as a conduit to another partition and use a tty -device on that partition as the other end of the pipe? - -A: Yes, on Power5 platforms the hvc_console driver provides a tty interface -for extra /dev/hvc* devices (where /dev/hvc0 is most likely the console). -In order to get a tty conduit working between the two partitions the HMC -Super Admin must create an additional "serial server" for the target -partition with the HMC gui which will show up as /dev/hvc* when the target -partition is rebooted. - -The HMC Super Admin then creates an additional "serial client" for the -current partition and points this at the target partition's newly created -"serial server" adapter (remember the slot). This shows up as an -additional /dev/hvcs* device. - -Now a program on the target system can be configured to read or write to -/dev/hvc* and another program on the current partition can be configured to -read or write to /dev/hvcs*. Now you have a tty conduit between two -partitions. - ---------------------------------------------------------------------------- -9. Reporting Bugs: - -The proper channel for reporting bugs is either through the Linux OS -distribution company that provided your OS or by posting issues to the -PowerPC development mailing list at: - -linuxppc-dev@lists.ozlabs.org - -This request is to provide a documented and searchable public exchange -of the problems and solutions surrounding this driver for the benefit of -all users. diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst new file mode 100644 index 000000000000..549b1cdd77ae --- /dev/null +++ b/Documentation/powerpc/index.rst @@ -0,0 +1,34 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======= +powerpc +======= + +.. toctree:: + :maxdepth: 1 + + bootwrapper + cpu_families + cpu_features + cxl + cxlflash + dawr-power9 + dscr + eeh-pci-error-recovery + firmware-assisted-dump + hvcs + isa-versions + mpc52xx + pci_iov_resource_on_powernv + pmu-ebb + ptrace + qe_firmware + syscall64-abi + transactional_memory + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/powerpc/isa-versions.rst b/Documentation/powerpc/isa-versions.rst index 66c24140ebf1..a363d8c1603c 100644 --- a/Documentation/powerpc/isa-versions.rst +++ b/Documentation/powerpc/isa-versions.rst @@ -1,13 +1,12 @@ -:orphan: - +========================== CPU to ISA Version Mapping ========================== Mapping of some CPU versions to relevant ISA versions. -========= ==================== +========= ==================================================================== CPU Architecture version -========= ==================== +========= ==================================================================== Power9 Power ISA v3.0B Power8 Power ISA v2.07 Power7 Power ISA v2.06 @@ -24,7 +23,7 @@ PPC970 - PowerPC User Instruction Set Architecture Book I v2.01 - PowerPC Virtual Environment Architecture Book II v2.01 - PowerPC Operating Environment Architecture Book III v2.01 - Plus Altivec/VMX ~= 2.03 -========= ==================== +========= ==================================================================== Key Features @@ -60,9 +59,9 @@ Power5 No PPC970 No ========== ==== -========== ==================== +========== ==================================== CPU Transactional Memory -========== ==================== +========== ==================================== Power9 Yes (* see transactional_memory.txt) Power8 Yes Power7 No @@ -73,4 +72,4 @@ Power5++ No Power5+ No Power5 No PPC970 No -========== ==================== +========== ==================================== diff --git a/Documentation/powerpc/mpc52xx.rst b/Documentation/powerpc/mpc52xx.rst new file mode 100644 index 000000000000..8676ac63e077 --- /dev/null +++ b/Documentation/powerpc/mpc52xx.rst @@ -0,0 +1,43 @@ +============================= +Linux 2.6.x on MPC52xx family +============================= + +For the latest info, go to http://www.246tNt.com/mpc52xx/ + +To compile/use : + + - U-Boot:: + + # tftpboot 200000 uImage + => tftpboot 400000 pRamdisk + => bootm 200000 400000 + + - DBug:: + + # dn -i zImage.initrd.lite5200 + + +Some remarks: + + - The port is named mpc52xxx, and config options are PPC_MPC52xx. The MGT5100 + is not supported, and I'm not sure anyone is interesting in working on it + so. I didn't took 5xxx because there's apparently a lot of 5xxx that have + nothing to do with the MPC5200. I also included the 'MPC' for the same + reason. + - Of course, I inspired myself from the 2.4 port. If you think I forgot to + mention you/your company in the copyright of some code, I'll correct it + ASAP. diff --git a/Documentation/powerpc/mpc52xx.txt b/Documentation/powerpc/mpc52xx.txt deleted file mode 100644 index 0d540a31ea1a..000000000000 --- a/Documentation/powerpc/mpc52xx.txt +++ /dev/null @@ -1,39 +0,0 @@ -Linux 2.6.x on MPC52xx family ------------------------------ - -For the latest info, go to http://www.246tNt.com/mpc52xx/ - -To compile/use : - - - U-Boot: - # tftpboot 200000 uImage - => tftpboot 400000 pRamdisk - => bootm 200000 400000 - - - DBug: - # dn -i zImage.initrd.lite5200 - - -Some remarks : - - The port is named mpc52xxx, and config options are PPC_MPC52xx. The MGT5100 - is not supported, and I'm not sure anyone is interesting in working on it - so. I didn't took 5xxx because there's apparently a lot of 5xxx that have - nothing to do with the MPC5200. I also included the 'MPC' for the same - reason. - - Of course, I inspired myself from the 2.4 port. If you think I forgot to - mention you/your company in the copyright of some code, I'll correct it - ASAP. diff --git a/Documentation/powerpc/pci_iov_resource_on_powernv.rst b/Documentation/powerpc/pci_iov_resource_on_powernv.rst new file mode 100644 index 000000000000..f5a5793e1613 --- /dev/null +++ b/Documentation/powerpc/pci_iov_resource_on_powernv.rst @@ -0,0 +1,312 @@ +=================================================== +PCI Express I/O Virtualization Resource on Powerenv +=================================================== + +Wei Yang + +Benjamin Herrenschmidt + +Bjorn Helgaas + +26 Aug 2014 + +This document describes the requirement from hardware for PCI MMIO resource +sizing and assignment on PowerKVM and how generic PCI code handles this +requirement. The first two sections describe the concepts of Partitionable +Endpoints and the implementation on P8 (IODA2). The next two sections talks +about considerations on enabling SRIOV on IODA2. + +1. Introduction to Partitionable Endpoints +========================================== + +A Partitionable Endpoint (PE) is a way to group the various resources +associated with a device or a set of devices to provide isolation between +partitions (i.e., filtering of DMA, MSIs etc.) and to provide a mechanism +to freeze a device that is causing errors in order to limit the possibility +of propagation of bad data. + +There is thus, in HW, a table of PE states that contains a pair of "frozen" +state bits (one for MMIO and one for DMA, they get set together but can be +cleared independently) for each PE. + +When a PE is frozen, all stores in any direction are dropped and all loads +return all 1's value. MSIs are also blocked. There's a bit more state that +captures things like the details of the error that caused the freeze etc., but +that's not critical. + +The interesting part is how the various PCIe transactions (MMIO, DMA, ...) +are matched to their corresponding PEs. + +The following section provides a rough description of what we have on P8 +(IODA2). Keep in mind that this is all per PHB (PCI host bridge). Each PHB +is a completely separate HW entity that replicates the entire logic, so has +its own set of PEs, etc. + +2. Implementation of Partitionable Endpoints on P8 (IODA2) +========================================================== + +P8 supports up to 256 Partitionable Endpoints per PHB. + + * Inbound + + For DMA, MSIs and inbound PCIe error messages, we have a table (in + memory but accessed in HW by the chip) that provides a direct + correspondence between a PCIe RID (bus/dev/fn) with a PE number. + We call this the RTT. + + - For DMA we then provide an entire address space for each PE that can + contain two "windows", depending on the value of PCI address bit 59. + Each window can be configured to be remapped via a "TCE table" (IOMMU + translation table), which has various configurable characteristics + not described here. + + - For MSIs, we have two windows in the address space (one at the top of + the 32-bit space and one much higher) which, via a combination of the + address and MSI value, will result in one of the 2048 interrupts per + bridge being triggered. There's a PE# in the interrupt controller + descriptor table as well which is compared with the PE# obtained from + the RTT to "authorize" the device to emit that specific interrupt. + + - Error messages just use the RTT. + + * Outbound. That's where the tricky part is. + + Like other PCI host bridges, the Power8 IODA2 PHB supports "windows" + from the CPU address space to the PCI address space. There is one M32 + window and sixteen M64 windows. They have different characteristics. + First what they have in common: they forward a configurable portion of + the CPU address space to the PCIe bus and must be naturally aligned + power of two in size. The rest is different: + + - The M32 window: + + * Is limited to 4GB in size. + + * Drops the top bits of the address (above the size) and replaces + them with a configurable value. This is typically used to generate + 32-bit PCIe accesses. We configure that window at boot from FW and + don't touch it from Linux; it's usually set to forward a 2GB + portion of address space from the CPU to PCIe + 0x8000_0000..0xffff_ffff. (Note: The top 64KB are actually + reserved for MSIs but this is not a problem at this point; we just + need to ensure Linux doesn't assign anything there, the M32 logic + ignores that however and will forward in that space if we try). + + * It is divided into 256 segments of equal size. A table in the chip + maps each segment to a PE#. That allows portions of the MMIO space + to be assigned to PEs on a segment granularity. For a 2GB window, + the segment granularity is 2GB/256 = 8MB. + + Now, this is the "main" window we use in Linux today (excluding + SR-IOV). We basically use the trick of forcing the bridge MMIO windows + onto a segment alignment/granularity so that the space behind a bridge + can be assigned to a PE. + + Ideally we would like to be able to have individual functions in PEs + but that would mean using a completely different address allocation + scheme where individual function BARs can be "grouped" to fit in one or + more segments. + + - The M64 windows: + + * Must be at least 256MB in size. + + * Do not translate addresses (the address on PCIe is the same as the + address on the PowerBus). There is a way to also set the top 14 + bits which are not conveyed by PowerBus but we don't use this. + + * Can be configured to be segmented. When not segmented, we can + specify the PE# for the entire window. When segmented, a window + has 256 segments; however, there is no table for mapping a segment + to a PE#. The segment number *is* the PE#. + + * Support overlaps. If an address is covered by multiple windows, + there's a defined ordering for which window applies. + + We have code (fairly new compared to the M32 stuff) that exploits that + for large BARs in 64-bit space: + + We configure an M64 window to cover the entire region of address space + that has been assigned by FW for the PHB (about 64GB, ignore the space + for the M32, it comes out of a different "reserve"). We configure it + as segmented. + + Then we do the same thing as with M32, using the bridge alignment + trick, to match to those giant segments. + + Since we cannot remap, we have two additional constraints: + + - We do the PE# allocation *after* the 64-bit space has been assigned + because the addresses we use directly determine the PE#. We then + update the M32 PE# for the devices that use both 32-bit and 64-bit + spaces or assign the remaining PE# to 32-bit only devices. + + - We cannot "group" segments in HW, so if a device ends up using more + than one segment, we end up with more than one PE#. There is a HW + mechanism to make the freeze state cascade to "companion" PEs but + that only works for PCIe error messages (typically used so that if + you freeze a switch, it freezes all its children). So we do it in + SW. We lose a bit of effectiveness of EEH in that case, but that's + the best we found. So when any of the PEs freezes, we freeze the + other ones for that "domain". We thus introduce the concept of + "master PE" which is the one used for DMA, MSIs, etc., and "secondary + PEs" that are used for the remaining M64 segments. + + We would like to investigate using additional M64 windows in "single + PE" mode to overlay over specific BARs to work around some of that, for + example for devices with very large BARs, e.g., GPUs. It would make + sense, but we haven't done it yet. + +3. Considerations for SR-IOV on PowerKVM +======================================== + + * SR-IOV Background + + The PCIe SR-IOV feature allows a single Physical Function (PF) to + support several Virtual Functions (VFs). Registers in the PF's SR-IOV + Capability control the number of VFs and whether they are enabled. + + When VFs are enabled, they appear in Configuration Space like normal + PCI devices, but the BARs in VF config space headers are unusual. For + a non-VF device, software uses BARs in the config space header to + discover the BAR sizes and assign addresses for them. For VF devices, + software uses VF BAR registers in the *PF* SR-IOV Capability to + discover sizes and assign addresses. The BARs in the VF's config space + header are read-only zeros. + + When a VF BAR in the PF SR-IOV Capability is programmed, it sets the + base address for all the corresponding VF(n) BARs. For example, if the + PF SR-IOV Capability is programmed to enable eight VFs, and it has a + 1MB VF BAR0, the address in that VF BAR sets the base of an 8MB region. + This region is divided into eight contiguous 1MB regions, each of which + is a BAR0 for one of the VFs. Note that even though the VF BAR + describes an 8MB region, the alignment requirement is for a single VF, + i.e., 1MB in this example. + + There are several strategies for isolating VFs in PEs: + + - M32 window: There's one M32 window, and it is split into 256 + equally-sized segments. The finest granularity possible is a 256MB + window with 1MB segments. VF BARs that are 1MB or larger could be + mapped to separate PEs in this window. Each segment can be + individually mapped to a PE via the lookup table, so this is quite + flexible, but it works best when all the VF BARs are the same size. If + they are different sizes, the entire window has to be small enough that + the segment size matches the smallest VF BAR, which means larger VF + BARs span several segments. + + - Non-segmented M64 window: A non-segmented M64 window is mapped entirely + to a single PE, so it could only isolate one VF. + + - Single segmented M64 windows: A segmented M64 window could be used just + like the M32 window, but the segments can't be individually mapped to + PEs (the segment number is the PE#), so there isn't as much + flexibility. A VF with multiple BARs would have to be in a "domain" of + multiple PEs, which is not as well isolated as a single PE. + + - Multiple segmented M64 windows: As usual, each window is split into 256 + equally-sized segments, and the segment number is the PE#. But if we + use several M64 windows, they can be set to different base addresses + and different segment sizes. If we have VFs that each have a 1MB BAR + and a 32MB BAR, we could use one M64 window to assign 1MB segments and + another M64 window to assign 32MB segments. + + Finally, the plan to use M64 windows for SR-IOV, which will be described + more in the next two sections. For a given VF BAR, we need to + effectively reserve the entire 256 segments (256 * VF BAR size) and + position the VF BAR to start at the beginning of a free range of + segments/PEs inside that M64 window. + + The goal is of course to be able to give a separate PE for each VF. + + The IODA2 platform has 16 M64 windows, which are used to map MMIO + range to PE#. Each M64 window defines one MMIO range and this range is + divided into 256 segments, with each segment corresponding to one PE. + + We decide to leverage this M64 window to map VFs to individual PEs, since + SR-IOV VF BARs are all the same size. + + But doing so introduces another problem: total_VFs is usually smaller + than the number of M64 window segments, so if we map one VF BAR directly + to one M64 window, some part of the M64 window will map to another + device's MMIO range. + + IODA supports 256 PEs, so segmented windows contain 256 segments, so if + total_VFs is less than 256, we have the situation in Figure 1.0, where + segments [total_VFs, 255] of the M64 window may map to some MMIO range on + other devices:: + + 0 1 total_VFs - 1 + +------+------+- -+------+------+ + | | | ... | | | + +------+------+- -+------+------+ + + VF(n) BAR space + + 0 1 total_VFs - 1 255 + +------+------+- -+------+------+- -+------+------+ + | | | ... | | | ... | | | + +------+------+- -+------+------+- -+------+------+ + + M64 window + + Figure 1.0 Direct map VF(n) BAR space + + Our current solution is to allocate 256 segments even if the VF(n) BAR + space doesn't need that much, as shown in Figure 1.1:: + + 0 1 total_VFs - 1 255 + +------+------+- -+------+------+- -+------+------+ + | | | ... | | | ... | | | + +------+------+- -+------+------+- -+------+------+ + + VF(n) BAR space + extra + + 0 1 total_VFs - 1 255 + +------+------+- -+------+------+- -+------+------+ + | | | ... | | | ... | | | + +------+------+- -+------+------+- -+------+------+ + + M64 window + + Figure 1.1 Map VF(n) BAR space + extra + + Allocating the extra space ensures that the entire M64 window will be + assigned to this one SR-IOV device and none of the space will be + available for other devices. Note that this only expands the space + reserved in software; there are still only total_VFs VFs, and they only + respond to segments [0, total_VFs - 1]. There's nothing in hardware that + responds to segments [total_VFs, 255]. + +4. Implications for the Generic PCI Code +======================================== + +The PCIe SR-IOV spec requires that the base of the VF(n) BAR space be +aligned to the size of an individual VF BAR. + +In IODA2, the MMIO address determines the PE#. If the address is in an M32 +window, we can set the PE# by updating the table that translates segments +to PE#s. Similarly, if the address is in an unsegmented M64 window, we can +set the PE# for the window. But if it's in a segmented M64 window, the +segment number is the PE#. + +Therefore, the only way to control the PE# for a VF is to change the base +of the VF(n) BAR space in the VF BAR. If the PCI core allocates the exact +amount of space required for the VF(n) BAR space, the VF BAR value is fixed +and cannot be changed. + +On the other hand, if the PCI core allocates additional space, the VF BAR +value can be changed as long as the entire VF(n) BAR space remains inside +the space allocated by the core. + +Ideally the segment size will be the same as an individual VF BAR size. +Then each VF will be in its own PE. The VF BARs (and therefore the PE#s) +are contiguous. If VF0 is in PE(x), then VF(n) is in PE(x+n). If we +allocate 256 segments, there are (256 - numVFs) choices for the PE# of VF0. + +If the segment size is smaller than the VF BAR size, it will take several +segments to cover a VF BAR, and a VF will be in several PEs. This is +possible, but the isolation isn't as good, and it reduces the number of PE# +choices because instead of consuming only numVFs segments, the VF(n) BAR +space will consume (numVFs * n) segments. That means there aren't as many +available segments for adjusting base of the VF(n) BAR space. diff --git a/Documentation/powerpc/pci_iov_resource_on_powernv.txt b/Documentation/powerpc/pci_iov_resource_on_powernv.txt deleted file mode 100644 index b55c5cd83f8d..000000000000 --- a/Documentation/powerpc/pci_iov_resource_on_powernv.txt +++ /dev/null @@ -1,301 +0,0 @@ -Wei Yang -Benjamin Herrenschmidt -Bjorn Helgaas -26 Aug 2014 - -This document describes the requirement from hardware for PCI MMIO resource -sizing and assignment on PowerKVM and how generic PCI code handles this -requirement. The first two sections describe the concepts of Partitionable -Endpoints and the implementation on P8 (IODA2). The next two sections talks -about considerations on enabling SRIOV on IODA2. - -1. Introduction to Partitionable Endpoints - -A Partitionable Endpoint (PE) is a way to group the various resources -associated with a device or a set of devices to provide isolation between -partitions (i.e., filtering of DMA, MSIs etc.) and to provide a mechanism -to freeze a device that is causing errors in order to limit the possibility -of propagation of bad data. - -There is thus, in HW, a table of PE states that contains a pair of "frozen" -state bits (one for MMIO and one for DMA, they get set together but can be -cleared independently) for each PE. - -When a PE is frozen, all stores in any direction are dropped and all loads -return all 1's value. MSIs are also blocked. There's a bit more state that -captures things like the details of the error that caused the freeze etc., but -that's not critical. - -The interesting part is how the various PCIe transactions (MMIO, DMA, ...) -are matched to their corresponding PEs. - -The following section provides a rough description of what we have on P8 -(IODA2). Keep in mind that this is all per PHB (PCI host bridge). Each PHB -is a completely separate HW entity that replicates the entire logic, so has -its own set of PEs, etc. - -2. Implementation of Partitionable Endpoints on P8 (IODA2) - -P8 supports up to 256 Partitionable Endpoints per PHB. - - * Inbound - - For DMA, MSIs and inbound PCIe error messages, we have a table (in - memory but accessed in HW by the chip) that provides a direct - correspondence between a PCIe RID (bus/dev/fn) with a PE number. - We call this the RTT. - - - For DMA we then provide an entire address space for each PE that can - contain two "windows", depending on the value of PCI address bit 59. - Each window can be configured to be remapped via a "TCE table" (IOMMU - translation table), which has various configurable characteristics - not described here. - - - For MSIs, we have two windows in the address space (one at the top of - the 32-bit space and one much higher) which, via a combination of the - address and MSI value, will result in one of the 2048 interrupts per - bridge being triggered. There's a PE# in the interrupt controller - descriptor table as well which is compared with the PE# obtained from - the RTT to "authorize" the device to emit that specific interrupt. - - - Error messages just use the RTT. - - * Outbound. That's where the tricky part is. - - Like other PCI host bridges, the Power8 IODA2 PHB supports "windows" - from the CPU address space to the PCI address space. There is one M32 - window and sixteen M64 windows. They have different characteristics. - First what they have in common: they forward a configurable portion of - the CPU address space to the PCIe bus and must be naturally aligned - power of two in size. The rest is different: - - - The M32 window: - - * Is limited to 4GB in size. - - * Drops the top bits of the address (above the size) and replaces - them with a configurable value. This is typically used to generate - 32-bit PCIe accesses. We configure that window at boot from FW and - don't touch it from Linux; it's usually set to forward a 2GB - portion of address space from the CPU to PCIe - 0x8000_0000..0xffff_ffff. (Note: The top 64KB are actually - reserved for MSIs but this is not a problem at this point; we just - need to ensure Linux doesn't assign anything there, the M32 logic - ignores that however and will forward in that space if we try). - - * It is divided into 256 segments of equal size. A table in the chip - maps each segment to a PE#. That allows portions of the MMIO space - to be assigned to PEs on a segment granularity. For a 2GB window, - the segment granularity is 2GB/256 = 8MB. - - Now, this is the "main" window we use in Linux today (excluding - SR-IOV). We basically use the trick of forcing the bridge MMIO windows - onto a segment alignment/granularity so that the space behind a bridge - can be assigned to a PE. - - Ideally we would like to be able to have individual functions in PEs - but that would mean using a completely different address allocation - scheme where individual function BARs can be "grouped" to fit in one or - more segments. - - - The M64 windows: - - * Must be at least 256MB in size. - - * Do not translate addresses (the address on PCIe is the same as the - address on the PowerBus). There is a way to also set the top 14 - bits which are not conveyed by PowerBus but we don't use this. - - * Can be configured to be segmented. When not segmented, we can - specify the PE# for the entire window. When segmented, a window - has 256 segments; however, there is no table for mapping a segment - to a PE#. The segment number *is* the PE#. - - * Support overlaps. If an address is covered by multiple windows, - there's a defined ordering for which window applies. - - We have code (fairly new compared to the M32 stuff) that exploits that - for large BARs in 64-bit space: - - We configure an M64 window to cover the entire region of address space - that has been assigned by FW for the PHB (about 64GB, ignore the space - for the M32, it comes out of a different "reserve"). We configure it - as segmented. - - Then we do the same thing as with M32, using the bridge alignment - trick, to match to those giant segments. - - Since we cannot remap, we have two additional constraints: - - - We do the PE# allocation *after* the 64-bit space has been assigned - because the addresses we use directly determine the PE#. We then - update the M32 PE# for the devices that use both 32-bit and 64-bit - spaces or assign the remaining PE# to 32-bit only devices. - - - We cannot "group" segments in HW, so if a device ends up using more - than one segment, we end up with more than one PE#. There is a HW - mechanism to make the freeze state cascade to "companion" PEs but - that only works for PCIe error messages (typically used so that if - you freeze a switch, it freezes all its children). So we do it in - SW. We lose a bit of effectiveness of EEH in that case, but that's - the best we found. So when any of the PEs freezes, we freeze the - other ones for that "domain". We thus introduce the concept of - "master PE" which is the one used for DMA, MSIs, etc., and "secondary - PEs" that are used for the remaining M64 segments. - - We would like to investigate using additional M64 windows in "single - PE" mode to overlay over specific BARs to work around some of that, for - example for devices with very large BARs, e.g., GPUs. It would make - sense, but we haven't done it yet. - -3. Considerations for SR-IOV on PowerKVM - - * SR-IOV Background - - The PCIe SR-IOV feature allows a single Physical Function (PF) to - support several Virtual Functions (VFs). Registers in the PF's SR-IOV - Capability control the number of VFs and whether they are enabled. - - When VFs are enabled, they appear in Configuration Space like normal - PCI devices, but the BARs in VF config space headers are unusual. For - a non-VF device, software uses BARs in the config space header to - discover the BAR sizes and assign addresses for them. For VF devices, - software uses VF BAR registers in the *PF* SR-IOV Capability to - discover sizes and assign addresses. The BARs in the VF's config space - header are read-only zeros. - - When a VF BAR in the PF SR-IOV Capability is programmed, it sets the - base address for all the corresponding VF(n) BARs. For example, if the - PF SR-IOV Capability is programmed to enable eight VFs, and it has a - 1MB VF BAR0, the address in that VF BAR sets the base of an 8MB region. - This region is divided into eight contiguous 1MB regions, each of which - is a BAR0 for one of the VFs. Note that even though the VF BAR - describes an 8MB region, the alignment requirement is for a single VF, - i.e., 1MB in this example. - - There are several strategies for isolating VFs in PEs: - - - M32 window: There's one M32 window, and it is split into 256 - equally-sized segments. The finest granularity possible is a 256MB - window with 1MB segments. VF BARs that are 1MB or larger could be - mapped to separate PEs in this window. Each segment can be - individually mapped to a PE via the lookup table, so this is quite - flexible, but it works best when all the VF BARs are the same size. If - they are different sizes, the entire window has to be small enough that - the segment size matches the smallest VF BAR, which means larger VF - BARs span several segments. - - - Non-segmented M64 window: A non-segmented M64 window is mapped entirely - to a single PE, so it could only isolate one VF. - - - Single segmented M64 windows: A segmented M64 window could be used just - like the M32 window, but the segments can't be individually mapped to - PEs (the segment number is the PE#), so there isn't as much - flexibility. A VF with multiple BARs would have to be in a "domain" of - multiple PEs, which is not as well isolated as a single PE. - - - Multiple segmented M64 windows: As usual, each window is split into 256 - equally-sized segments, and the segment number is the PE#. But if we - use several M64 windows, they can be set to different base addresses - and different segment sizes. If we have VFs that each have a 1MB BAR - and a 32MB BAR, we could use one M64 window to assign 1MB segments and - another M64 window to assign 32MB segments. - - Finally, the plan to use M64 windows for SR-IOV, which will be described - more in the next two sections. For a given VF BAR, we need to - effectively reserve the entire 256 segments (256 * VF BAR size) and - position the VF BAR to start at the beginning of a free range of - segments/PEs inside that M64 window. - - The goal is of course to be able to give a separate PE for each VF. - - The IODA2 platform has 16 M64 windows, which are used to map MMIO - range to PE#. Each M64 window defines one MMIO range and this range is - divided into 256 segments, with each segment corresponding to one PE. - - We decide to leverage this M64 window to map VFs to individual PEs, since - SR-IOV VF BARs are all the same size. - - But doing so introduces another problem: total_VFs is usually smaller - than the number of M64 window segments, so if we map one VF BAR directly - to one M64 window, some part of the M64 window will map to another - device's MMIO range. - - IODA supports 256 PEs, so segmented windows contain 256 segments, so if - total_VFs is less than 256, we have the situation in Figure 1.0, where - segments [total_VFs, 255] of the M64 window may map to some MMIO range on - other devices: - - 0 1 total_VFs - 1 - +------+------+- -+------+------+ - | | | ... | | | - +------+------+- -+------+------+ - - VF(n) BAR space - - 0 1 total_VFs - 1 255 - +------+------+- -+------+------+- -+------+------+ - | | | ... | | | ... | | | - +------+------+- -+------+------+- -+------+------+ - - M64 window - - Figure 1.0 Direct map VF(n) BAR space - - Our current solution is to allocate 256 segments even if the VF(n) BAR - space doesn't need that much, as shown in Figure 1.1: - - 0 1 total_VFs - 1 255 - +------+------+- -+------+------+- -+------+------+ - | | | ... | | | ... | | | - +------+------+- -+------+------+- -+------+------+ - - VF(n) BAR space + extra - - 0 1 total_VFs - 1 255 - +------+------+- -+------+------+- -+------+------+ - | | | ... | | | ... | | | - +------+------+- -+------+------+- -+------+------+ - - M64 window - - Figure 1.1 Map VF(n) BAR space + extra - - Allocating the extra space ensures that the entire M64 window will be - assigned to this one SR-IOV device and none of the space will be - available for other devices. Note that this only expands the space - reserved in software; there are still only total_VFs VFs, and they only - respond to segments [0, total_VFs - 1]. There's nothing in hardware that - responds to segments [total_VFs, 255]. - -4. Implications for the Generic PCI Code - -The PCIe SR-IOV spec requires that the base of the VF(n) BAR space be -aligned to the size of an individual VF BAR. - -In IODA2, the MMIO address determines the PE#. If the address is in an M32 -window, we can set the PE# by updating the table that translates segments -to PE#s. Similarly, if the address is in an unsegmented M64 window, we can -set the PE# for the window. But if it's in a segmented M64 window, the -segment number is the PE#. - -Therefore, the only way to control the PE# for a VF is to change the base -of the VF(n) BAR space in the VF BAR. If the PCI core allocates the exact -amount of space required for the VF(n) BAR space, the VF BAR value is fixed -and cannot be changed. - -On the other hand, if the PCI core allocates additional space, the VF BAR -value can be changed as long as the entire VF(n) BAR space remains inside -the space allocated by the core. - -Ideally the segment size will be the same as an individual VF BAR size. -Then each VF will be in its own PE. The VF BARs (and therefore the PE#s) -are contiguous. If VF0 is in PE(x), then VF(n) is in PE(x+n). If we -allocate 256 segments, there are (256 - numVFs) choices for the PE# of VF0. - -If the segment size is smaller than the VF BAR size, it will take several -segments to cover a VF BAR, and a VF will be in several PEs. This is -possible, but the isolation isn't as good, and it reduces the number of PE# -choices because instead of consuming only numVFs segments, the VF(n) BAR -space will consume (numVFs * n) segments. That means there aren't as many -available segments for adjusting base of the VF(n) BAR space. diff --git a/Documentation/powerpc/pmu-ebb.rst b/Documentation/powerpc/pmu-ebb.rst new file mode 100644 index 000000000000..4f474758eb55 --- /dev/null +++ b/Documentation/powerpc/pmu-ebb.rst @@ -0,0 +1,138 @@ +======================== +PMU Event Based Branches +======================== + +Event Based Branches (EBBs) are a feature which allows the hardware to +branch directly to a specified user space address when certain events occur. + +The full specification is available in Power ISA v2.07: + + https://www.power.org/documentation/power-isa-version-2-07/ + +One type of event for which EBBs can be configured is PMU exceptions. This +document describes the API for configuring the Power PMU to generate EBBs, +using the Linux perf_events API. + + +Terminology +----------- + +Throughout this document we will refer to an "EBB event" or "EBB events". This +just refers to a struct perf_event which has set the "EBB" flag in its +attr.config. All events which can be configured on the hardware PMU are +possible "EBB events". + + +Background +---------- + +When a PMU EBB occurs it is delivered to the currently running process. As such +EBBs can only sensibly be used by programs for self-monitoring. + +It is a feature of the perf_events API that events can be created on other +processes, subject to standard permission checks. This is also true of EBB +events, however unless the target process enables EBBs (via mtspr(BESCR)) no +EBBs will ever be delivered. + +This makes it possible for a process to enable EBBs for itself, but not +actually configure any events. At a later time another process can come along +and attach an EBB event to the process, which will then cause EBBs to be +delivered to the first process. It's not clear if this is actually useful. + + +When the PMU is configured for EBBs, all PMU interrupts are delivered to the +user process. This means once an EBB event is scheduled on the PMU, no non-EBB +events can be configured. This means that EBB events can not be run +concurrently with regular 'perf' commands, or any other perf events. + +It is however safe to run 'perf' commands on a process which is using EBBs. The +kernel will in general schedule the EBB event, and perf will be notified that +its events could not run. + +The exclusion between EBB events and regular events is implemented using the +existing "pinned" and "exclusive" attributes of perf_events. This means EBB +events will be given priority over other events, unless they are also pinned. +If an EBB event and a regular event are both pinned, then whichever is enabled +first will be scheduled and the other will be put in error state. See the +section below titled "Enabling an EBB event" for more information. + + +Creating an EBB event +--------------------- + +To request that an event is counted using EBB, the event code should have bit +63 set. + +EBB events must be created with a particular, and restrictive, set of +attributes - this is so that they interoperate correctly with the rest of the +perf_events subsystem. + +An EBB event must be created with the "pinned" and "exclusive" attributes set. +Note that if you are creating a group of EBB events, only the leader can have +these attributes set. + +An EBB event must NOT set any of the "inherit", "sample_period", "freq" or +"enable_on_exec" attributes. + +An EBB event must be attached to a task. This is specified to perf_event_open() +by passing a pid value, typically 0 indicating the current task. + +All events in a group must agree on whether they want EBB. That is all events +must request EBB, or none may request EBB. + +EBB events must specify the PMC they are to be counted on. This ensures +userspace is able to reliably determine which PMC the event is scheduled on. + + +Enabling an EBB event +--------------------- + +Once an EBB event has been successfully opened, it must be enabled with the +perf_events API. This can be achieved either via the ioctl() interface, or the +prctl() interface. + +However, due to the design of the perf_events API, enabling an event does not +guarantee that it has been scheduled on the PMU. To ensure that the EBB event +has been scheduled on the PMU, you must perform a read() on the event. If the +read() returns EOF, then the event has not been scheduled and EBBs are not +enabled. + +This behaviour occurs because the EBB event is pinned and exclusive. When the +EBB event is enabled it will force all other non-pinned events off the PMU. In +this case the enable will be successful. However if there is already an event +pinned on the PMU then the enable will not be successful. + + +Reading an EBB event +-------------------- + +It is possible to read() from an EBB event. However the results are +meaningless. Because interrupts are being delivered to the user process the +kernel is not able to count the event, and so will return a junk value. + + +Closing an EBB event +-------------------- + +When an EBB event is finished with, you can close it using close() as for any +regular event. If this is the last EBB event the PMU will be deconfigured and +no further PMU EBBs will be delivered. + + +EBB Handler +----------- + +The EBB handler is just regular userspace code, however it must be written in +the style of an interrupt handler. When the handler is entered all registers +are live (possibly) and so must be saved somehow before the handler can invoke +other code. + +It's up to the program how to handle this. For C programs a relatively simple +option is to create an interrupt frame on the stack and save registers there. + +Fork +---- + +EBB events are not inherited across fork. If the child process wishes to use +EBBs it should open a new event for itself. Similarly the EBB state in +BESCR/EBBHR/EBBRR is cleared across fork(). diff --git a/Documentation/powerpc/pmu-ebb.txt b/Documentation/powerpc/pmu-ebb.txt deleted file mode 100644 index 73cd163dbfb8..000000000000 --- a/Documentation/powerpc/pmu-ebb.txt +++ /dev/null @@ -1,137 +0,0 @@ -PMU Event Based Branches -======================== - -Event Based Branches (EBBs) are a feature which allows the hardware to -branch directly to a specified user space address when certain events occur. - -The full specification is available in Power ISA v2.07: - - https://www.power.org/documentation/power-isa-version-2-07/ - -One type of event for which EBBs can be configured is PMU exceptions. This -document describes the API for configuring the Power PMU to generate EBBs, -using the Linux perf_events API. - - -Terminology ------------ - -Throughout this document we will refer to an "EBB event" or "EBB events". This -just refers to a struct perf_event which has set the "EBB" flag in its -attr.config. All events which can be configured on the hardware PMU are -possible "EBB events". - - -Background ----------- - -When a PMU EBB occurs it is delivered to the currently running process. As such -EBBs can only sensibly be used by programs for self-monitoring. - -It is a feature of the perf_events API that events can be created on other -processes, subject to standard permission checks. This is also true of EBB -events, however unless the target process enables EBBs (via mtspr(BESCR)) no -EBBs will ever be delivered. - -This makes it possible for a process to enable EBBs for itself, but not -actually configure any events. At a later time another process can come along -and attach an EBB event to the process, which will then cause EBBs to be -delivered to the first process. It's not clear if this is actually useful. - - -When the PMU is configured for EBBs, all PMU interrupts are delivered to the -user process. This means once an EBB event is scheduled on the PMU, no non-EBB -events can be configured. This means that EBB events can not be run -concurrently with regular 'perf' commands, or any other perf events. - -It is however safe to run 'perf' commands on a process which is using EBBs. The -kernel will in general schedule the EBB event, and perf will be notified that -its events could not run. - -The exclusion between EBB events and regular events is implemented using the -existing "pinned" and "exclusive" attributes of perf_events. This means EBB -events will be given priority over other events, unless they are also pinned. -If an EBB event and a regular event are both pinned, then whichever is enabled -first will be scheduled and the other will be put in error state. See the -section below titled "Enabling an EBB event" for more information. - - -Creating an EBB event ---------------------- - -To request that an event is counted using EBB, the event code should have bit -63 set. - -EBB events must be created with a particular, and restrictive, set of -attributes - this is so that they interoperate correctly with the rest of the -perf_events subsystem. - -An EBB event must be created with the "pinned" and "exclusive" attributes set. -Note that if you are creating a group of EBB events, only the leader can have -these attributes set. - -An EBB event must NOT set any of the "inherit", "sample_period", "freq" or -"enable_on_exec" attributes. - -An EBB event must be attached to a task. This is specified to perf_event_open() -by passing a pid value, typically 0 indicating the current task. - -All events in a group must agree on whether they want EBB. That is all events -must request EBB, or none may request EBB. - -EBB events must specify the PMC they are to be counted on. This ensures -userspace is able to reliably determine which PMC the event is scheduled on. - - -Enabling an EBB event ---------------------- - -Once an EBB event has been successfully opened, it must be enabled with the -perf_events API. This can be achieved either via the ioctl() interface, or the -prctl() interface. - -However, due to the design of the perf_events API, enabling an event does not -guarantee that it has been scheduled on the PMU. To ensure that the EBB event -has been scheduled on the PMU, you must perform a read() on the event. If the -read() returns EOF, then the event has not been scheduled and EBBs are not -enabled. - -This behaviour occurs because the EBB event is pinned and exclusive. When the -EBB event is enabled it will force all other non-pinned events off the PMU. In -this case the enable will be successful. However if there is already an event -pinned on the PMU then the enable will not be successful. - - -Reading an EBB event --------------------- - -It is possible to read() from an EBB event. However the results are -meaningless. Because interrupts are being delivered to the user process the -kernel is not able to count the event, and so will return a junk value. - - -Closing an EBB event --------------------- - -When an EBB event is finished with, you can close it using close() as for any -regular event. If this is the last EBB event the PMU will be deconfigured and -no further PMU EBBs will be delivered. - - -EBB Handler ------------ - -The EBB handler is just regular userspace code, however it must be written in -the style of an interrupt handler. When the handler is entered all registers -are live (possibly) and so must be saved somehow before the handler can invoke -other code. - -It's up to the program how to handle this. For C programs a relatively simple -option is to create an interrupt frame on the stack and save registers there. - -Fork ----- - -EBB events are not inherited across fork. If the child process wishes to use -EBBs it should open a new event for itself. Similarly the EBB state in -BESCR/EBBHR/EBBRR is cleared across fork(). diff --git a/Documentation/powerpc/ptrace.rst b/Documentation/powerpc/ptrace.rst new file mode 100644 index 000000000000..864d4b6dddd1 --- /dev/null +++ b/Documentation/powerpc/ptrace.rst @@ -0,0 +1,156 @@ +====== +Ptrace +====== + +GDB intends to support the following hardware debug features of BookE +processors: + +4 hardware breakpoints (IAC) +2 hardware watchpoints (read, write and read-write) (DAC) +2 value conditions for the hardware watchpoints (DVC) + +For that, we need to extend ptrace so that GDB can query and set these +resources. Since we're extending, we're trying to create an interface +that's extendable and that covers both BookE and server processors, so +that GDB doesn't need to special-case each of them. We added the +following 3 new ptrace requests. + +1. PTRACE_PPC_GETHWDEBUGINFO +============================ + +Query for GDB to discover the hardware debug features. The main info to +be returned here is the minimum alignment for the hardware watchpoints. +BookE processors don't have restrictions here, but server processors have +an 8-byte alignment restriction for hardware watchpoints. We'd like to avoid +adding special cases to GDB based on what it sees in AUXV. + +Since we're at it, we added other useful info that the kernel can return to +GDB: this query will return the number of hardware breakpoints, hardware +watchpoints and whether it supports a range of addresses and a condition. +The query will fill the following structure provided by the requesting process:: + + struct ppc_debug_info { + unit32_t version; + unit32_t num_instruction_bps; + unit32_t num_data_bps; + unit32_t num_condition_regs; + unit32_t data_bp_alignment; + unit32_t sizeof_condition; /* size of the DVC register */ + uint64_t features; /* bitmask of the individual flags */ + }; + +features will have bits indicating whether there is support for:: + + #define PPC_DEBUG_FEATURE_INSN_BP_RANGE 0x1 + #define PPC_DEBUG_FEATURE_INSN_BP_MASK 0x2 + #define PPC_DEBUG_FEATURE_DATA_BP_RANGE 0x4 + #define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x8 + #define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x10 + +2. PTRACE_SETHWDEBUG + +Sets a hardware breakpoint or watchpoint, according to the provided structure:: + + struct ppc_hw_breakpoint { + uint32_t version; + #define PPC_BREAKPOINT_TRIGGER_EXECUTE 0x1 + #define PPC_BREAKPOINT_TRIGGER_READ 0x2 + #define PPC_BREAKPOINT_TRIGGER_WRITE 0x4 + uint32_t trigger_type; /* only some combinations allowed */ + #define PPC_BREAKPOINT_MODE_EXACT 0x0 + #define PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE 0x1 + #define PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE 0x2 + #define PPC_BREAKPOINT_MODE_MASK 0x3 + uint32_t addr_mode; /* address match mode */ + + #define PPC_BREAKPOINT_CONDITION_MODE 0x3 + #define PPC_BREAKPOINT_CONDITION_NONE 0x0 + #define PPC_BREAKPOINT_CONDITION_AND 0x1 + #define PPC_BREAKPOINT_CONDITION_EXACT 0x1 /* different name for the same thing as above */ + #define PPC_BREAKPOINT_CONDITION_OR 0x2 + #define PPC_BREAKPOINT_CONDITION_AND_OR 0x3 + #define PPC_BREAKPOINT_CONDITION_BE_ALL 0x00ff0000 /* byte enable bits */ + #define PPC_BREAKPOINT_CONDITION_BE(n) (1<<((n)+16)) + uint32_t condition_mode; /* break/watchpoint condition flags */ + + uint64_t addr; + uint64_t addr2; + uint64_t condition_value; + }; + +A request specifies one event, not necessarily just one register to be set. +For instance, if the request is for a watchpoint with a condition, both the +DAC and DVC registers will be set in the same request. + +With this GDB can ask for all kinds of hardware breakpoints and watchpoints +that the BookE supports. COMEFROM breakpoints available in server processors +are not contemplated, but that is out of the scope of this work. + +ptrace will return an integer (handle) uniquely identifying the breakpoint or +watchpoint just created. This integer will be used in the PTRACE_DELHWDEBUG +request to ask for its removal. Return -ENOSPC if the requested breakpoint +can't be allocated on the registers. + +Some examples of using the structure to: + +- set a breakpoint in the first breakpoint register:: + + p.version = PPC_DEBUG_CURRENT_VERSION; + p.trigger_type = PPC_BREAKPOINT_TRIGGER_EXECUTE; + p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; + p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; + p.addr = (uint64_t) address; + p.addr2 = 0; + p.condition_value = 0; + +- set a watchpoint which triggers on reads in the second watchpoint register:: + + p.version = PPC_DEBUG_CURRENT_VERSION; + p.trigger_type = PPC_BREAKPOINT_TRIGGER_READ; + p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; + p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; + p.addr = (uint64_t) address; + p.addr2 = 0; + p.condition_value = 0; + +- set a watchpoint which triggers only with a specific value:: + + p.version = PPC_DEBUG_CURRENT_VERSION; + p.trigger_type = PPC_BREAKPOINT_TRIGGER_READ; + p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; + p.condition_mode = PPC_BREAKPOINT_CONDITION_AND | PPC_BREAKPOINT_CONDITION_BE_ALL; + p.addr = (uint64_t) address; + p.addr2 = 0; + p.condition_value = (uint64_t) condition; + +- set a ranged hardware breakpoint:: + + p.version = PPC_DEBUG_CURRENT_VERSION; + p.trigger_type = PPC_BREAKPOINT_TRIGGER_EXECUTE; + p.addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE; + p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; + p.addr = (uint64_t) begin_range; + p.addr2 = (uint64_t) end_range; + p.condition_value = 0; + +- set a watchpoint in server processors (BookS):: + + p.version = 1; + p.trigger_type = PPC_BREAKPOINT_TRIGGER_RW; + p.addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE; + or + p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; + + p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; + p.addr = (uint64_t) begin_range; + /* For PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE addr2 needs to be specified, where + * addr2 - addr <= 8 Bytes. + */ + p.addr2 = (uint64_t) end_range; + p.condition_value = 0; + +3. PTRACE_DELHWDEBUG + +Takes an integer which identifies an existing breakpoint or watchpoint +(i.e., the value returned from PTRACE_SETHWDEBUG), and deletes the +corresponding breakpoint or watchpoint.. diff --git a/Documentation/powerpc/ptrace.txt b/Documentation/powerpc/ptrace.txt deleted file mode 100644 index 99c5ce88d0fe..000000000000 --- a/Documentation/powerpc/ptrace.txt +++ /dev/null @@ -1,151 +0,0 @@ -GDB intends to support the following hardware debug features of BookE -processors: - -4 hardware breakpoints (IAC) -2 hardware watchpoints (read, write and read-write) (DAC) -2 value conditions for the hardware watchpoints (DVC) - -For that, we need to extend ptrace so that GDB can query and set these -resources. Since we're extending, we're trying to create an interface -that's extendable and that covers both BookE and server processors, so -that GDB doesn't need to special-case each of them. We added the -following 3 new ptrace requests. - -1. PTRACE_PPC_GETHWDEBUGINFO - -Query for GDB to discover the hardware debug features. The main info to -be returned here is the minimum alignment for the hardware watchpoints. -BookE processors don't have restrictions here, but server processors have -an 8-byte alignment restriction for hardware watchpoints. We'd like to avoid -adding special cases to GDB based on what it sees in AUXV. - -Since we're at it, we added other useful info that the kernel can return to -GDB: this query will return the number of hardware breakpoints, hardware -watchpoints and whether it supports a range of addresses and a condition. -The query will fill the following structure provided by the requesting process: - -struct ppc_debug_info { - unit32_t version; - unit32_t num_instruction_bps; - unit32_t num_data_bps; - unit32_t num_condition_regs; - unit32_t data_bp_alignment; - unit32_t sizeof_condition; /* size of the DVC register */ - uint64_t features; /* bitmask of the individual flags */ -}; - -features will have bits indicating whether there is support for: - -#define PPC_DEBUG_FEATURE_INSN_BP_RANGE 0x1 -#define PPC_DEBUG_FEATURE_INSN_BP_MASK 0x2 -#define PPC_DEBUG_FEATURE_DATA_BP_RANGE 0x4 -#define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x8 -#define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x10 - -2. PTRACE_SETHWDEBUG - -Sets a hardware breakpoint or watchpoint, according to the provided structure: - -struct ppc_hw_breakpoint { - uint32_t version; -#define PPC_BREAKPOINT_TRIGGER_EXECUTE 0x1 -#define PPC_BREAKPOINT_TRIGGER_READ 0x2 -#define PPC_BREAKPOINT_TRIGGER_WRITE 0x4 - uint32_t trigger_type; /* only some combinations allowed */ -#define PPC_BREAKPOINT_MODE_EXACT 0x0 -#define PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE 0x1 -#define PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE 0x2 -#define PPC_BREAKPOINT_MODE_MASK 0x3 - uint32_t addr_mode; /* address match mode */ - -#define PPC_BREAKPOINT_CONDITION_MODE 0x3 -#define PPC_BREAKPOINT_CONDITION_NONE 0x0 -#define PPC_BREAKPOINT_CONDITION_AND 0x1 -#define PPC_BREAKPOINT_CONDITION_EXACT 0x1 /* different name for the same thing as above */ -#define PPC_BREAKPOINT_CONDITION_OR 0x2 -#define PPC_BREAKPOINT_CONDITION_AND_OR 0x3 -#define PPC_BREAKPOINT_CONDITION_BE_ALL 0x00ff0000 /* byte enable bits */ -#define PPC_BREAKPOINT_CONDITION_BE(n) (1<<((n)+16)) - uint32_t condition_mode; /* break/watchpoint condition flags */ - - uint64_t addr; - uint64_t addr2; - uint64_t condition_value; -}; - -A request specifies one event, not necessarily just one register to be set. -For instance, if the request is for a watchpoint with a condition, both the -DAC and DVC registers will be set in the same request. - -With this GDB can ask for all kinds of hardware breakpoints and watchpoints -that the BookE supports. COMEFROM breakpoints available in server processors -are not contemplated, but that is out of the scope of this work. - -ptrace will return an integer (handle) uniquely identifying the breakpoint or -watchpoint just created. This integer will be used in the PTRACE_DELHWDEBUG -request to ask for its removal. Return -ENOSPC if the requested breakpoint -can't be allocated on the registers. - -Some examples of using the structure to: - -- set a breakpoint in the first breakpoint register - - p.version = PPC_DEBUG_CURRENT_VERSION; - p.trigger_type = PPC_BREAKPOINT_TRIGGER_EXECUTE; - p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; - p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; - p.addr = (uint64_t) address; - p.addr2 = 0; - p.condition_value = 0; - -- set a watchpoint which triggers on reads in the second watchpoint register - - p.version = PPC_DEBUG_CURRENT_VERSION; - p.trigger_type = PPC_BREAKPOINT_TRIGGER_READ; - p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; - p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; - p.addr = (uint64_t) address; - p.addr2 = 0; - p.condition_value = 0; - -- set a watchpoint which triggers only with a specific value - - p.version = PPC_DEBUG_CURRENT_VERSION; - p.trigger_type = PPC_BREAKPOINT_TRIGGER_READ; - p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; - p.condition_mode = PPC_BREAKPOINT_CONDITION_AND | PPC_BREAKPOINT_CONDITION_BE_ALL; - p.addr = (uint64_t) address; - p.addr2 = 0; - p.condition_value = (uint64_t) condition; - -- set a ranged hardware breakpoint - - p.version = PPC_DEBUG_CURRENT_VERSION; - p.trigger_type = PPC_BREAKPOINT_TRIGGER_EXECUTE; - p.addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE; - p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; - p.addr = (uint64_t) begin_range; - p.addr2 = (uint64_t) end_range; - p.condition_value = 0; - -- set a watchpoint in server processors (BookS) - - p.version = 1; - p.trigger_type = PPC_BREAKPOINT_TRIGGER_RW; - p.addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE; - or - p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; - - p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; - p.addr = (uint64_t) begin_range; - /* For PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE addr2 needs to be specified, where - * addr2 - addr <= 8 Bytes. - */ - p.addr2 = (uint64_t) end_range; - p.condition_value = 0; - -3. PTRACE_DELHWDEBUG - -Takes an integer which identifies an existing breakpoint or watchpoint -(i.e., the value returned from PTRACE_SETHWDEBUG), and deletes the -corresponding breakpoint or watchpoint.. diff --git a/Documentation/powerpc/qe_firmware.rst b/Documentation/powerpc/qe_firmware.rst new file mode 100644 index 000000000000..42f5103140c9 --- /dev/null +++ b/Documentation/powerpc/qe_firmware.rst @@ -0,0 +1,296 @@ +========================================= +Freescale QUICC Engine Firmware Uploading +========================================= + +(c) 2007 Timur Tabi , + Freescale Semiconductor + +.. Table of Contents + + I - Software License for Firmware + + II - Microcode Availability + + III - Description and Terminology + + IV - Microcode Programming Details + + V - Firmware Structure Layout + + VI - Sample Code for Creating Firmware Files + +Revision Information +==================== + +November 30, 2007: Rev 1.0 - Initial version + +I - Software License for Firmware +================================= + +Each firmware file comes with its own software license. For information on +the particular license, please see the license text that is distributed with +the firmware. + +II - Microcode Availability +=========================== + +Firmware files are distributed through various channels. Some are available on +http://opensource.freescale.com. For other firmware files, please contact +your Freescale representative or your operating system vendor. + +III - Description and Terminology +================================= + +In this document, the term 'microcode' refers to the sequence of 32-bit +integers that compose the actual QE microcode. + +The term 'firmware' refers to a binary blob that contains the microcode as +well as other data that + + 1) describes the microcode's purpose + 2) describes how and where to upload the microcode + 3) specifies the values of various registers + 4) includes additional data for use by specific device drivers + +Firmware files are binary files that contain only a firmware. + +IV - Microcode Programming Details +=================================== + +The QE architecture allows for only one microcode present in I-RAM for each +RISC processor. To replace any current microcode, a full QE reset (which +disables the microcode) must be performed first. + +QE microcode is uploaded using the following procedure: + +1) The microcode is placed into I-RAM at a specific location, using the + IRAM.IADD and IRAM.IDATA registers. + +2) The CERCR.CIR bit is set to 0 or 1, depending on whether the firmware + needs split I-RAM. Split I-RAM is only meaningful for SOCs that have + QEs with multiple RISC processors, such as the 8360. Splitting the I-RAM + allows each processor to run a different microcode, effectively creating an + asymmetric multiprocessing (AMP) system. + +3) The TIBCR trap registers are loaded with the addresses of the trap handlers + in the microcode. + +4) The RSP.ECCR register is programmed with the value provided. + +5) If necessary, device drivers that need the virtual traps and extended mode + data will use them. + +Virtual Microcode Traps + +These virtual traps are conditional branches in the microcode. These are +"soft" provisional introduced in the ROMcode in order to enable higher +flexibility and save h/w traps If new features are activated or an issue is +being fixed in the RAM package utilizing they should be activated. This data +structure signals the microcode which of these virtual traps is active. + +This structure contains 6 words that the application should copy to some +specific been defined. This table describes the structure:: + + --------------------------------------------------------------- + | Offset in | | Destination Offset | Size of | + | array | Protocol | within PRAM | Operand | + --------------------------------------------------------------| + | 0 | Ethernet | 0xF8 | 4 bytes | + | | interworking | | | + --------------------------------------------------------------- + | 4 | ATM | 0xF8 | 4 bytes | + | | interworking | | | + --------------------------------------------------------------- + | 8 | PPP | 0xF8 | 4 bytes | + | | interworking | | | + --------------------------------------------------------------- + | 12 | Ethernet RX | 0x22 | 1 byte | + | | Distributor Page | | | + --------------------------------------------------------------- + | 16 | ATM Globtal | 0x28 | 1 byte | + | | Params Table | | | + --------------------------------------------------------------- + | 20 | Insert Frame | 0xF8 | 4 bytes | + --------------------------------------------------------------- + + +Extended Modes + +This is a double word bit array (64 bits) that defines special functionality +which has an impact on the software drivers. Each bit has its own impact +and has special instructions for the s/w associated with it. This structure is +described in this table:: + + ----------------------------------------------------------------------- + | Bit # | Name | Description | + ----------------------------------------------------------------------- + | 0 | General | Indicates that prior to each host command | + | | push command | given by the application, the software must | + | | | assert a special host command (push command)| + | | | CECDR = 0x00800000. | + | | | CECR = 0x01c1000f. | + ----------------------------------------------------------------------- + | 1 | UCC ATM | Indicates that after issuing ATM RX INIT | + | | RX INIT | command, the host must issue another special| + | | push command | command (push command) and immediately | + | | | following that re-issue the ATM RX INIT | + | | | command. (This makes the sequence of | + | | | initializing the ATM receiver a sequence of | + | | | three host commands) | + | | | CECDR = 0x00800000. | + | | | CECR = 0x01c1000f. | + ----------------------------------------------------------------------- + | 2 | Add/remove | Indicates that following the specific host | + | | command | command: "Add/Remove entry in Hash Lookup | + | | validation | Table" used in Interworking setup, the user | + | | | must issue another command. | + | | | CECDR = 0xce000003. | + | | | CECR = 0x01c10f58. | + ----------------------------------------------------------------------- + | 3 | General push | Indicates that the s/w has to initialize | + | | command | some pointers in the Ethernet thread pages | + | | | which are used when Header Compression is | + | | | activated. The full details of these | + | | | pointers is located in the software drivers.| + ----------------------------------------------------------------------- + | 4 | General push | Indicates that after issuing Ethernet TX | + | | command | INIT command, user must issue this command | + | | | for each SNUM of Ethernet TX thread. | + | | | CECDR = 0x00800003. | + | | | CECR = 0x7'b{0}, 8'b{Enet TX thread SNUM}, | + | | | 1'b{1}, 12'b{0}, 4'b{1} | + ----------------------------------------------------------------------- + | 5 - 31 | N/A | Reserved, set to zero. | + ----------------------------------------------------------------------- + +V - Firmware Structure Layout +============================== + +QE microcode from Freescale is typically provided as a header file. This +header file contains macros that define the microcode binary itself as well as +some other data used in uploading that microcode. The format of these files +do not lend themselves to simple inclusion into other code. Hence, +the need for a more portable format. This section defines that format. + +Instead of distributing a header file, the microcode and related data are +embedded into a binary blob. This blob is passed to the qe_upload_firmware() +function, which parses the blob and performs everything necessary to upload +the microcode. + +All integers are big-endian. See the comments for function +qe_upload_firmware() for up-to-date implementation information. + +This structure supports versioning, where the version of the structure is +embedded into the structure itself. To ensure forward and backwards +compatibility, all versions of the structure must use the same 'qe_header' +structure at the beginning. + +'header' (type: struct qe_header): + The 'length' field is the size, in bytes, of the entire structure, + including all the microcode embedded in it, as well as the CRC (if + present). + + The 'magic' field is an array of three bytes that contains the letters + 'Q', 'E', and 'F'. This is an identifier that indicates that this + structure is a QE Firmware structure. + + The 'version' field is a single byte that indicates the version of this + structure. If the layout of the structure should ever need to be + changed to add support for additional types of microcode, then the + version number should also be changed. + +The 'id' field is a null-terminated string(suitable for printing) that +identifies the firmware. + +The 'count' field indicates the number of 'microcode' structures. There +must be one and only one 'microcode' structure for each RISC processor. +Therefore, this field also represents the number of RISC processors for this +SOC. + +The 'soc' structure contains the SOC numbers and revisions used to match +the microcode to the SOC itself. Normally, the microcode loader should +check the data in this structure with the SOC number and revisions, and +only upload the microcode if there's a match. However, this check is not +made on all platforms. + +Although it is not recommended, you can specify '0' in the soc.model +field to skip matching SOCs altogether. + +The 'model' field is a 16-bit number that matches the actual SOC. The +'major' and 'minor' fields are the major and minor revision numbers, +respectively, of the SOC. + +For example, to match the 8323, revision 1.0:: + + soc.model = 8323 + soc.major = 1 + soc.minor = 0 + +'padding' is necessary for structure alignment. This field ensures that the +'extended_modes' field is aligned on a 64-bit boundary. + +'extended_modes' is a bitfield that defines special functionality which has an +impact on the device drivers. Each bit has its own impact and has special +instructions for the driver associated with it. This field is stored in +the QE library and available to any driver that calles qe_get_firmware_info(). + +'vtraps' is an array of 8 words that contain virtual trap values for each +virtual traps. As with 'extended_modes', this field is stored in the QE +library and available to any driver that calles qe_get_firmware_info(). + +'microcode' (type: struct qe_microcode): + For each RISC processor there is one 'microcode' structure. The first + 'microcode' structure is for the first RISC, and so on. + + The 'id' field is a null-terminated string suitable for printing that + identifies this particular microcode. + + 'traps' is an array of 16 words that contain hardware trap values + for each of the 16 traps. If trap[i] is 0, then this particular + trap is to be ignored (i.e. not written to TIBCR[i]). The entire value + is written as-is to the TIBCR[i] register, so be sure to set the EN + and T_IBP bits if necessary. + + 'eccr' is the value to program into the ECCR register. + + 'iram_offset' is the offset into IRAM to start writing the + microcode. + + 'count' is the number of 32-bit words in the microcode. + + 'code_offset' is the offset, in bytes, from the beginning of this + structure where the microcode itself can be found. The first + microcode binary should be located immediately after the 'microcode' + array. + + 'major', 'minor', and 'revision' are the major, minor, and revision + version numbers, respectively, of the microcode. If all values are 0, + then these fields are ignored. + + 'reserved' is necessary for structure alignment. Since 'microcode' + is an array, the 64-bit 'extended_modes' field needs to be aligned + on a 64-bit boundary, and this can only happen if the size of + 'microcode' is a multiple of 8 bytes. To ensure that, we add + 'reserved'. + +After the last microcode is a 32-bit CRC. It can be calculated using +this algorithm:: + + u32 crc32(const u8 *p, unsigned int len) + { + unsigned int i; + u32 crc = 0; + + while (len--) { + crc ^= *p++; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0); + } + return crc; + } + +VI - Sample Code for Creating Firmware Files +============================================ + +A Python program that creates firmware binaries from the header files normally +distributed by Freescale can be found on http://opensource.freescale.com. diff --git a/Documentation/powerpc/qe_firmware.txt b/Documentation/powerpc/qe_firmware.txt deleted file mode 100644 index e7ac24aec4ff..000000000000 --- a/Documentation/powerpc/qe_firmware.txt +++ /dev/null @@ -1,295 +0,0 @@ - Freescale QUICC Engine Firmware Uploading - ----------------------------------------- - -(c) 2007 Timur Tabi , - Freescale Semiconductor - -Table of Contents -================= - - I - Software License for Firmware - - II - Microcode Availability - - III - Description and Terminology - - IV - Microcode Programming Details - - V - Firmware Structure Layout - - VI - Sample Code for Creating Firmware Files - -Revision Information -==================== - -November 30, 2007: Rev 1.0 - Initial version - -I - Software License for Firmware -================================= - -Each firmware file comes with its own software license. For information on -the particular license, please see the license text that is distributed with -the firmware. - -II - Microcode Availability -=========================== - -Firmware files are distributed through various channels. Some are available on -http://opensource.freescale.com. For other firmware files, please contact -your Freescale representative or your operating system vendor. - -III - Description and Terminology -================================ - -In this document, the term 'microcode' refers to the sequence of 32-bit -integers that compose the actual QE microcode. - -The term 'firmware' refers to a binary blob that contains the microcode as -well as other data that - - 1) describes the microcode's purpose - 2) describes how and where to upload the microcode - 3) specifies the values of various registers - 4) includes additional data for use by specific device drivers - -Firmware files are binary files that contain only a firmware. - -IV - Microcode Programming Details -=================================== - -The QE architecture allows for only one microcode present in I-RAM for each -RISC processor. To replace any current microcode, a full QE reset (which -disables the microcode) must be performed first. - -QE microcode is uploaded using the following procedure: - -1) The microcode is placed into I-RAM at a specific location, using the - IRAM.IADD and IRAM.IDATA registers. - -2) The CERCR.CIR bit is set to 0 or 1, depending on whether the firmware - needs split I-RAM. Split I-RAM is only meaningful for SOCs that have - QEs with multiple RISC processors, such as the 8360. Splitting the I-RAM - allows each processor to run a different microcode, effectively creating an - asymmetric multiprocessing (AMP) system. - -3) The TIBCR trap registers are loaded with the addresses of the trap handlers - in the microcode. - -4) The RSP.ECCR register is programmed with the value provided. - -5) If necessary, device drivers that need the virtual traps and extended mode - data will use them. - -Virtual Microcode Traps - -These virtual traps are conditional branches in the microcode. These are -"soft" provisional introduced in the ROMcode in order to enable higher -flexibility and save h/w traps If new features are activated or an issue is -being fixed in the RAM package utilizing they should be activated. This data -structure signals the microcode which of these virtual traps is active. - -This structure contains 6 words that the application should copy to some -specific been defined. This table describes the structure. - - --------------------------------------------------------------- - | Offset in | | Destination Offset | Size of | - | array | Protocol | within PRAM | Operand | - --------------------------------------------------------------| - | 0 | Ethernet | 0xF8 | 4 bytes | - | | interworking | | | - --------------------------------------------------------------- - | 4 | ATM | 0xF8 | 4 bytes | - | | interworking | | | - --------------------------------------------------------------- - | 8 | PPP | 0xF8 | 4 bytes | - | | interworking | | | - --------------------------------------------------------------- - | 12 | Ethernet RX | 0x22 | 1 byte | - | | Distributor Page | | | - --------------------------------------------------------------- - | 16 | ATM Globtal | 0x28 | 1 byte | - | | Params Table | | | - --------------------------------------------------------------- - | 20 | Insert Frame | 0xF8 | 4 bytes | - --------------------------------------------------------------- - - -Extended Modes - -This is a double word bit array (64 bits) that defines special functionality -which has an impact on the software drivers. Each bit has its own impact -and has special instructions for the s/w associated with it. This structure is -described in this table: - - ----------------------------------------------------------------------- - | Bit # | Name | Description | - ----------------------------------------------------------------------- - | 0 | General | Indicates that prior to each host command | - | | push command | given by the application, the software must | - | | | assert a special host command (push command)| - | | | CECDR = 0x00800000. | - | | | CECR = 0x01c1000f. | - ----------------------------------------------------------------------- - | 1 | UCC ATM | Indicates that after issuing ATM RX INIT | - | | RX INIT | command, the host must issue another special| - | | push command | command (push command) and immediately | - | | | following that re-issue the ATM RX INIT | - | | | command. (This makes the sequence of | - | | | initializing the ATM receiver a sequence of | - | | | three host commands) | - | | | CECDR = 0x00800000. | - | | | CECR = 0x01c1000f. | - ----------------------------------------------------------------------- - | 2 | Add/remove | Indicates that following the specific host | - | | command | command: "Add/Remove entry in Hash Lookup | - | | validation | Table" used in Interworking setup, the user | - | | | must issue another command. | - | | | CECDR = 0xce000003. | - | | | CECR = 0x01c10f58. | - ----------------------------------------------------------------------- - | 3 | General push | Indicates that the s/w has to initialize | - | | command | some pointers in the Ethernet thread pages | - | | | which are used when Header Compression is | - | | | activated. The full details of these | - | | | pointers is located in the software drivers.| - ----------------------------------------------------------------------- - | 4 | General push | Indicates that after issuing Ethernet TX | - | | command | INIT command, user must issue this command | - | | | for each SNUM of Ethernet TX thread. | - | | | CECDR = 0x00800003. | - | | | CECR = 0x7'b{0}, 8'b{Enet TX thread SNUM}, | - | | | 1'b{1}, 12'b{0}, 4'b{1} | - ----------------------------------------------------------------------- - | 5 - 31 | N/A | Reserved, set to zero. | - ----------------------------------------------------------------------- - -V - Firmware Structure Layout -============================== - -QE microcode from Freescale is typically provided as a header file. This -header file contains macros that define the microcode binary itself as well as -some other data used in uploading that microcode. The format of these files -do not lend themselves to simple inclusion into other code. Hence, -the need for a more portable format. This section defines that format. - -Instead of distributing a header file, the microcode and related data are -embedded into a binary blob. This blob is passed to the qe_upload_firmware() -function, which parses the blob and performs everything necessary to upload -the microcode. - -All integers are big-endian. See the comments for function -qe_upload_firmware() for up-to-date implementation information. - -This structure supports versioning, where the version of the structure is -embedded into the structure itself. To ensure forward and backwards -compatibility, all versions of the structure must use the same 'qe_header' -structure at the beginning. - -'header' (type: struct qe_header): - The 'length' field is the size, in bytes, of the entire structure, - including all the microcode embedded in it, as well as the CRC (if - present). - - The 'magic' field is an array of three bytes that contains the letters - 'Q', 'E', and 'F'. This is an identifier that indicates that this - structure is a QE Firmware structure. - - The 'version' field is a single byte that indicates the version of this - structure. If the layout of the structure should ever need to be - changed to add support for additional types of microcode, then the - version number should also be changed. - -The 'id' field is a null-terminated string(suitable for printing) that -identifies the firmware. - -The 'count' field indicates the number of 'microcode' structures. There -must be one and only one 'microcode' structure for each RISC processor. -Therefore, this field also represents the number of RISC processors for this -SOC. - -The 'soc' structure contains the SOC numbers and revisions used to match -the microcode to the SOC itself. Normally, the microcode loader should -check the data in this structure with the SOC number and revisions, and -only upload the microcode if there's a match. However, this check is not -made on all platforms. - -Although it is not recommended, you can specify '0' in the soc.model -field to skip matching SOCs altogether. - -The 'model' field is a 16-bit number that matches the actual SOC. The -'major' and 'minor' fields are the major and minor revision numbers, -respectively, of the SOC. - -For example, to match the 8323, revision 1.0: - soc.model = 8323 - soc.major = 1 - soc.minor = 0 - -'padding' is necessary for structure alignment. This field ensures that the -'extended_modes' field is aligned on a 64-bit boundary. - -'extended_modes' is a bitfield that defines special functionality which has an -impact on the device drivers. Each bit has its own impact and has special -instructions for the driver associated with it. This field is stored in -the QE library and available to any driver that calles qe_get_firmware_info(). - -'vtraps' is an array of 8 words that contain virtual trap values for each -virtual traps. As with 'extended_modes', this field is stored in the QE -library and available to any driver that calles qe_get_firmware_info(). - -'microcode' (type: struct qe_microcode): - For each RISC processor there is one 'microcode' structure. The first - 'microcode' structure is for the first RISC, and so on. - - The 'id' field is a null-terminated string suitable for printing that - identifies this particular microcode. - - 'traps' is an array of 16 words that contain hardware trap values - for each of the 16 traps. If trap[i] is 0, then this particular - trap is to be ignored (i.e. not written to TIBCR[i]). The entire value - is written as-is to the TIBCR[i] register, so be sure to set the EN - and T_IBP bits if necessary. - - 'eccr' is the value to program into the ECCR register. - - 'iram_offset' is the offset into IRAM to start writing the - microcode. - - 'count' is the number of 32-bit words in the microcode. - - 'code_offset' is the offset, in bytes, from the beginning of this - structure where the microcode itself can be found. The first - microcode binary should be located immediately after the 'microcode' - array. - - 'major', 'minor', and 'revision' are the major, minor, and revision - version numbers, respectively, of the microcode. If all values are 0, - then these fields are ignored. - - 'reserved' is necessary for structure alignment. Since 'microcode' - is an array, the 64-bit 'extended_modes' field needs to be aligned - on a 64-bit boundary, and this can only happen if the size of - 'microcode' is a multiple of 8 bytes. To ensure that, we add - 'reserved'. - -After the last microcode is a 32-bit CRC. It can be calculated using -this algorithm: - -u32 crc32(const u8 *p, unsigned int len) -{ - unsigned int i; - u32 crc = 0; - - while (len--) { - crc ^= *p++; - for (i = 0; i < 8; i++) - crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0); - } - return crc; -} - -VI - Sample Code for Creating Firmware Files -============================================ - -A Python program that creates firmware binaries from the header files normally -distributed by Freescale can be found on http://opensource.freescale.com. diff --git a/Documentation/powerpc/syscall64-abi.rst b/Documentation/powerpc/syscall64-abi.rst new file mode 100644 index 000000000000..e49f69f941b9 --- /dev/null +++ b/Documentation/powerpc/syscall64-abi.rst @@ -0,0 +1,110 @@ +=============================================== +Power Architecture 64-bit Linux system call ABI +=============================================== + +syscall +======= + +syscall calling sequence\ [1]_ matches the Power Architecture 64-bit ELF ABI +specification C function calling sequence, including register preservation +rules, with the following differences. + +.. [1] Some syscalls (typically low-level management functions) may have + different calling sequences (e.g., rt_sigreturn). + +Parameters and return value +--------------------------- +The system call number is specified in r0. + +There is a maximum of 6 integer parameters to a syscall, passed in r3-r8. + +Both a return value and a return error code are returned. cr0.SO is the return +error code, and r3 is the return value or error code. When cr0.SO is clear, +the syscall succeeded and r3 is the return value. When cr0.SO is set, the +syscall failed and r3 is the error code that generally corresponds to errno. + +Stack +----- +System calls do not modify the caller's stack frame. For example, the caller's +stack frame LR and CR save fields are not used. + +Register preservation rules +--------------------------- +Register preservation rules match the ELF ABI calling sequence with the +following differences: + +=========== ============= ======================================== +r0 Volatile (System call number.) +r3 Volatile (Parameter 1, and return value.) +r4-r8 Volatile (Parameters 2-6.) +cr0 Volatile (cr0.SO is the return error condition) +cr1, cr5-7 Nonvolatile +lr Nonvolatile +=========== ============= ======================================== + +All floating point and vector data registers as well as control and status +registers are nonvolatile. + +Invocation +---------- +The syscall is performed with the sc instruction, and returns with execution +continuing at the instruction following the sc instruction. + +Transactional Memory +-------------------- +Syscall behavior can change if the processor is in transactional or suspended +transaction state, and the syscall can affect the behavior of the transaction. + +If the processor is in suspended state when a syscall is made, the syscall +will be performed as normal, and will return as normal. The syscall will be +performed in suspended state, so its side effects will be persistent according +to the usual transactional memory semantics. A syscall may or may not result +in the transaction being doomed by hardware. + +If the processor is in transactional state when a syscall is made, then the +behavior depends on the presence of PPC_FEATURE2_HTM_NOSC in the AT_HWCAP2 ELF +auxiliary vector. + +- If present, which is the case for newer kernels, then the syscall will not + be performed and the transaction will be doomed by the kernel with the + failure code TM_CAUSE_SYSCALL | TM_CAUSE_PERSISTENT in the TEXASR SPR. + +- If not present (older kernels), then the kernel will suspend the + transactional state and the syscall will proceed as in the case of a + suspended state syscall, and will resume the transactional state before + returning to the caller. This case is not well defined or supported, so this + behavior should not be relied upon. + + +vsyscall +======== + +vsyscall calling sequence matches the syscall calling sequence, with the +following differences. Some vsyscalls may have different calling sequences. + +Parameters and return value +--------------------------- +r0 is not used as an input. The vsyscall is selected by its address. + +Stack +----- +The vsyscall may or may not use the caller's stack frame save areas. + +Register preservation rules +--------------------------- + +=========== ======== +r0 Volatile +cr1, cr5-7 Volatile +lr Volatile +=========== ======== + +Invocation +---------- +The vsyscall is performed with a branch-with-link instruction to the vsyscall +function address. + +Transactional Memory +-------------------- +vsyscalls will run in the same transactional state as the caller. A vsyscall +may or may not result in the transaction being doomed by hardware. diff --git a/Documentation/powerpc/syscall64-abi.txt b/Documentation/powerpc/syscall64-abi.txt deleted file mode 100644 index fa716a0d88bd..000000000000 --- a/Documentation/powerpc/syscall64-abi.txt +++ /dev/null @@ -1,105 +0,0 @@ -=============================================== -Power Architecture 64-bit Linux system call ABI -=============================================== - -syscall -======= - -syscall calling sequence[*] matches the Power Architecture 64-bit ELF ABI -specification C function calling sequence, including register preservation -rules, with the following differences. - -[*] Some syscalls (typically low-level management functions) may have - different calling sequences (e.g., rt_sigreturn). - -Parameters and return value ---------------------------- -The system call number is specified in r0. - -There is a maximum of 6 integer parameters to a syscall, passed in r3-r8. - -Both a return value and a return error code are returned. cr0.SO is the return -error code, and r3 is the return value or error code. When cr0.SO is clear, -the syscall succeeded and r3 is the return value. When cr0.SO is set, the -syscall failed and r3 is the error code that generally corresponds to errno. - -Stack ------ -System calls do not modify the caller's stack frame. For example, the caller's -stack frame LR and CR save fields are not used. - -Register preservation rules ---------------------------- -Register preservation rules match the ELF ABI calling sequence with the -following differences: - -r0: Volatile. (System call number.) -r3: Volatile. (Parameter 1, and return value.) -r4-r8: Volatile. (Parameters 2-6.) -cr0: Volatile (cr0.SO is the return error condition) -cr1, cr5-7: Nonvolatile. -lr: Nonvolatile. - -All floating point and vector data registers as well as control and status -registers are nonvolatile. - -Invocation ----------- -The syscall is performed with the sc instruction, and returns with execution -continuing at the instruction following the sc instruction. - -Transactional Memory --------------------- -Syscall behavior can change if the processor is in transactional or suspended -transaction state, and the syscall can affect the behavior of the transaction. - -If the processor is in suspended state when a syscall is made, the syscall -will be performed as normal, and will return as normal. The syscall will be -performed in suspended state, so its side effects will be persistent according -to the usual transactional memory semantics. A syscall may or may not result -in the transaction being doomed by hardware. - -If the processor is in transactional state when a syscall is made, then the -behavior depends on the presence of PPC_FEATURE2_HTM_NOSC in the AT_HWCAP2 ELF -auxiliary vector. - -- If present, which is the case for newer kernels, then the syscall will not - be performed and the transaction will be doomed by the kernel with the - failure code TM_CAUSE_SYSCALL | TM_CAUSE_PERSISTENT in the TEXASR SPR. - -- If not present (older kernels), then the kernel will suspend the - transactional state and the syscall will proceed as in the case of a - suspended state syscall, and will resume the transactional state before - returning to the caller. This case is not well defined or supported, so this - behavior should not be relied upon. - - -vsyscall -======== - -vsyscall calling sequence matches the syscall calling sequence, with the -following differences. Some vsyscalls may have different calling sequences. - -Parameters and return value ---------------------------- -r0 is not used as an input. The vsyscall is selected by its address. - -Stack ------ -The vsyscall may or may not use the caller's stack frame save areas. - -Register preservation rules ---------------------------- -r0: Volatile. -cr1, cr5-7: Volatile. -lr: Volatile. - -Invocation ----------- -The vsyscall is performed with a branch-with-link instruction to the vsyscall -function address. - -Transactional Memory --------------------- -vsyscalls will run in the same transactional state as the caller. A vsyscall -may or may not result in the transaction being doomed by hardware. diff --git a/Documentation/powerpc/transactional_memory.rst b/Documentation/powerpc/transactional_memory.rst new file mode 100644 index 000000000000..09955103acb4 --- /dev/null +++ b/Documentation/powerpc/transactional_memory.rst @@ -0,0 +1,247 @@ +============================ +Transactional Memory support +============================ + +POWER kernel support for this feature is currently limited to supporting +its use by user programs. It is not currently used by the kernel itself. + +This file aims to sum up how it is supported by Linux and what behaviour you +can expect from your user programs. + + +Basic overview +============== + +Hardware Transactional Memory is supported on POWER8 processors, and is a +feature that enables a different form of atomic memory access. Several new +instructions are presented to delimit transactions; transactions are +guaranteed to either complete atomically or roll back and undo any partial +changes. + +A simple transaction looks like this:: + + begin_move_money: + tbegin + beq abort_handler + + ld r4, SAVINGS_ACCT(r3) + ld r5, CURRENT_ACCT(r3) + subi r5, r5, 1 + addi r4, r4, 1 + std r4, SAVINGS_ACCT(r3) + std r5, CURRENT_ACCT(r3) + + tend + + b continue + + abort_handler: + ... test for odd failures ... + + /* Retry the transaction if it failed because it conflicted with + * someone else: */ + b begin_move_money + + +The 'tbegin' instruction denotes the start point, and 'tend' the end point. +Between these points the processor is in 'Transactional' state; any memory +references will complete in one go if there are no conflicts with other +transactional or non-transactional accesses within the system. In this +example, the transaction completes as though it were normal straight-line code +IF no other processor has touched SAVINGS_ACCT(r3) or CURRENT_ACCT(r3); an +atomic move of money from the current account to the savings account has been +performed. Even though the normal ld/std instructions are used (note no +lwarx/stwcx), either *both* SAVINGS_ACCT(r3) and CURRENT_ACCT(r3) will be +updated, or neither will be updated. + +If, in the meantime, there is a conflict with the locations accessed by the +transaction, the transaction will be aborted by the CPU. Register and memory +state will roll back to that at the 'tbegin', and control will continue from +'tbegin+4'. The branch to abort_handler will be taken this second time; the +abort handler can check the cause of the failure, and retry. + +Checkpointed registers include all GPRs, FPRs, VRs/VSRs, LR, CCR/CR, CTR, FPCSR +and a few other status/flag regs; see the ISA for details. + +Causes of transaction aborts +============================ + +- Conflicts with cache lines used by other processors +- Signals +- Context switches +- See the ISA for full documentation of everything that will abort transactions. + + +Syscalls +======== + +Syscalls made from within an active transaction will not be performed and the +transaction will be doomed by the kernel with the failure code TM_CAUSE_SYSCALL +| TM_CAUSE_PERSISTENT. + +Syscalls made from within a suspended transaction are performed as normal and +the transaction is not explicitly doomed by the kernel. However, what the +kernel does to perform the syscall may result in the transaction being doomed +by the hardware. The syscall is performed in suspended mode so any side +effects will be persistent, independent of transaction success or failure. No +guarantees are provided by the kernel about which syscalls will affect +transaction success. + +Care must be taken when relying on syscalls to abort during active transactions +if the calls are made via a library. Libraries may cache values (which may +give the appearance of success) or perform operations that cause transaction +failure before entering the kernel (which may produce different failure codes). +Examples are glibc's getpid() and lazy symbol resolution. + + +Signals +======= + +Delivery of signals (both sync and async) during transactions provides a second +thread state (ucontext/mcontext) to represent the second transactional register +state. Signal delivery 'treclaim's to capture both register states, so signals +abort transactions. The usual ucontext_t passed to the signal handler +represents the checkpointed/original register state; the signal appears to have +arisen at 'tbegin+4'. + +If the sighandler ucontext has uc_link set, a second ucontext has been +delivered. For future compatibility the MSR.TS field should be checked to +determine the transactional state -- if so, the second ucontext in uc->uc_link +represents the active transactional registers at the point of the signal. + +For 64-bit processes, uc->uc_mcontext.regs->msr is a full 64-bit MSR and its TS +field shows the transactional mode. + +For 32-bit processes, the mcontext's MSR register is only 32 bits; the top 32 +bits are stored in the MSR of the second ucontext, i.e. in +uc->uc_link->uc_mcontext.regs->msr. The top word contains the transactional +state TS. + +However, basic signal handlers don't need to be aware of transactions +and simply returning from the handler will deal with things correctly: + +Transaction-aware signal handlers can read the transactional register state +from the second ucontext. This will be necessary for crash handlers to +determine, for example, the address of the instruction causing the SIGSEGV. + +Example signal handler:: + + void crash_handler(int sig, siginfo_t *si, void *uc) + { + ucontext_t *ucp = uc; + ucontext_t *transactional_ucp = ucp->uc_link; + + if (ucp_link) { + u64 msr = ucp->uc_mcontext.regs->msr; + /* May have transactional ucontext! */ + #ifndef __powerpc64__ + msr |= ((u64)transactional_ucp->uc_mcontext.regs->msr) << 32; + #endif + if (MSR_TM_ACTIVE(msr)) { + /* Yes, we crashed during a transaction. Oops. */ + fprintf(stderr, "Transaction to be restarted at 0x%llx, but " + "crashy instruction was at 0x%llx\n", + ucp->uc_mcontext.regs->nip, + transactional_ucp->uc_mcontext.regs->nip); + } + } + + fix_the_problem(ucp->dar); + } + +When in an active transaction that takes a signal, we need to be careful with +the stack. It's possible that the stack has moved back up after the tbegin. +The obvious case here is when the tbegin is called inside a function that +returns before a tend. In this case, the stack is part of the checkpointed +transactional memory state. If we write over this non transactionally or in +suspend, we are in trouble because if we get a tm abort, the program counter and +stack pointer will be back at the tbegin but our in memory stack won't be valid +anymore. + +To avoid this, when taking a signal in an active transaction, we need to use +the stack pointer from the checkpointed state, rather than the speculated +state. This ensures that the signal context (written tm suspended) will be +written below the stack required for the rollback. The transaction is aborted +because of the treclaim, so any memory written between the tbegin and the +signal will be rolled back anyway. + +For signals taken in non-TM or suspended mode, we use the +normal/non-checkpointed stack pointer. + +Any transaction initiated inside a sighandler and suspended on return +from the sighandler to the kernel will get reclaimed and discarded. + +Failure cause codes used by kernel +================================== + +These are defined in , and distinguish different reasons why the +kernel aborted a transaction: + + ====================== ================================ + TM_CAUSE_RESCHED Thread was rescheduled. + TM_CAUSE_TLBI Software TLB invalid. + TM_CAUSE_FAC_UNAV FP/VEC/VSX unavailable trap. + TM_CAUSE_SYSCALL Syscall from active transaction. + TM_CAUSE_SIGNAL Signal delivered. + TM_CAUSE_MISC Currently unused. + TM_CAUSE_ALIGNMENT Alignment fault. + TM_CAUSE_EMULATE Emulation that touched memory. + ====================== ================================ + +These can be checked by the user program's abort handler as TEXASR[0:7]. If +bit 7 is set, it indicates that the error is consider persistent. For example +a TM_CAUSE_ALIGNMENT will be persistent while a TM_CAUSE_RESCHED will not. + +GDB +=== + +GDB and ptrace are not currently TM-aware. If one stops during a transaction, +it looks like the transaction has just started (the checkpointed state is +presented). The transaction cannot then be continued and will take the failure +handler route. Furthermore, the transactional 2nd register state will be +inaccessible. GDB can currently be used on programs using TM, but not sensibly +in parts within transactions. + +POWER9 +====== + +TM on POWER9 has issues with storing the complete register state. This +is described in this commit:: + + commit 4bb3c7a0208fc13ca70598efd109901a7cd45ae7 + Author: Paul Mackerras + Date: Wed Mar 21 21:32:01 2018 +1100 + KVM: PPC: Book3S HV: Work around transactional memory bugs in POWER9 + +To account for this different POWER9 chips have TM enabled in +different ways. + +On POWER9N DD2.01 and below, TM is disabled. ie +HWCAP2[PPC_FEATURE2_HTM] is not set. + +On POWER9N DD2.1 TM is configured by firmware to always abort a +transaction when tm suspend occurs. So tsuspend will cause a +transaction to be aborted and rolled back. Kernel exceptions will also +cause the transaction to be aborted and rolled back and the exception +will not occur. If userspace constructs a sigcontext that enables TM +suspend, the sigcontext will be rejected by the kernel. This mode is +advertised to users with HWCAP2[PPC_FEATURE2_HTM_NO_SUSPEND] set. +HWCAP2[PPC_FEATURE2_HTM] is not set in this mode. + +On POWER9N DD2.2 and above, KVM and POWERVM emulate TM for guests (as +described in commit 4bb3c7a0208f), hence TM is enabled for guests +ie. HWCAP2[PPC_FEATURE2_HTM] is set for guest userspace. Guests that +makes heavy use of TM suspend (tsuspend or kernel suspend) will result +in traps into the hypervisor and hence will suffer a performance +degradation. Host userspace has TM disabled +ie. HWCAP2[PPC_FEATURE2_HTM] is not set. (although we make enable it +at some point in the future if we bring the emulation into host +userspace context switching). + +POWER9C DD1.2 and above are only available with POWERVM and hence +Linux only runs as a guest. On these systems TM is emulated like on +POWER9N DD2.2. + +Guest migration from POWER8 to POWER9 will work with POWER9N DD2.2 and +POWER9C DD1.2. Since earlier POWER9 processors don't support TM +emulation, migration from POWER8 to POWER9 is not supported there. diff --git a/Documentation/powerpc/transactional_memory.txt b/Documentation/powerpc/transactional_memory.txt deleted file mode 100644 index 52c023e14f26..000000000000 --- a/Documentation/powerpc/transactional_memory.txt +++ /dev/null @@ -1,244 +0,0 @@ -Transactional Memory support -============================ - -POWER kernel support for this feature is currently limited to supporting -its use by user programs. It is not currently used by the kernel itself. - -This file aims to sum up how it is supported by Linux and what behaviour you -can expect from your user programs. - - -Basic overview -============== - -Hardware Transactional Memory is supported on POWER8 processors, and is a -feature that enables a different form of atomic memory access. Several new -instructions are presented to delimit transactions; transactions are -guaranteed to either complete atomically or roll back and undo any partial -changes. - -A simple transaction looks like this: - -begin_move_money: - tbegin - beq abort_handler - - ld r4, SAVINGS_ACCT(r3) - ld r5, CURRENT_ACCT(r3) - subi r5, r5, 1 - addi r4, r4, 1 - std r4, SAVINGS_ACCT(r3) - std r5, CURRENT_ACCT(r3) - - tend - - b continue - -abort_handler: - ... test for odd failures ... - - /* Retry the transaction if it failed because it conflicted with - * someone else: */ - b begin_move_money - - -The 'tbegin' instruction denotes the start point, and 'tend' the end point. -Between these points the processor is in 'Transactional' state; any memory -references will complete in one go if there are no conflicts with other -transactional or non-transactional accesses within the system. In this -example, the transaction completes as though it were normal straight-line code -IF no other processor has touched SAVINGS_ACCT(r3) or CURRENT_ACCT(r3); an -atomic move of money from the current account to the savings account has been -performed. Even though the normal ld/std instructions are used (note no -lwarx/stwcx), either *both* SAVINGS_ACCT(r3) and CURRENT_ACCT(r3) will be -updated, or neither will be updated. - -If, in the meantime, there is a conflict with the locations accessed by the -transaction, the transaction will be aborted by the CPU. Register and memory -state will roll back to that at the 'tbegin', and control will continue from -'tbegin+4'. The branch to abort_handler will be taken this second time; the -abort handler can check the cause of the failure, and retry. - -Checkpointed registers include all GPRs, FPRs, VRs/VSRs, LR, CCR/CR, CTR, FPCSR -and a few other status/flag regs; see the ISA for details. - -Causes of transaction aborts -============================ - -- Conflicts with cache lines used by other processors -- Signals -- Context switches -- See the ISA for full documentation of everything that will abort transactions. - - -Syscalls -======== - -Syscalls made from within an active transaction will not be performed and the -transaction will be doomed by the kernel with the failure code TM_CAUSE_SYSCALL -| TM_CAUSE_PERSISTENT. - -Syscalls made from within a suspended transaction are performed as normal and -the transaction is not explicitly doomed by the kernel. However, what the -kernel does to perform the syscall may result in the transaction being doomed -by the hardware. The syscall is performed in suspended mode so any side -effects will be persistent, independent of transaction success or failure. No -guarantees are provided by the kernel about which syscalls will affect -transaction success. - -Care must be taken when relying on syscalls to abort during active transactions -if the calls are made via a library. Libraries may cache values (which may -give the appearance of success) or perform operations that cause transaction -failure before entering the kernel (which may produce different failure codes). -Examples are glibc's getpid() and lazy symbol resolution. - - -Signals -======= - -Delivery of signals (both sync and async) during transactions provides a second -thread state (ucontext/mcontext) to represent the second transactional register -state. Signal delivery 'treclaim's to capture both register states, so signals -abort transactions. The usual ucontext_t passed to the signal handler -represents the checkpointed/original register state; the signal appears to have -arisen at 'tbegin+4'. - -If the sighandler ucontext has uc_link set, a second ucontext has been -delivered. For future compatibility the MSR.TS field should be checked to -determine the transactional state -- if so, the second ucontext in uc->uc_link -represents the active transactional registers at the point of the signal. - -For 64-bit processes, uc->uc_mcontext.regs->msr is a full 64-bit MSR and its TS -field shows the transactional mode. - -For 32-bit processes, the mcontext's MSR register is only 32 bits; the top 32 -bits are stored in the MSR of the second ucontext, i.e. in -uc->uc_link->uc_mcontext.regs->msr. The top word contains the transactional -state TS. - -However, basic signal handlers don't need to be aware of transactions -and simply returning from the handler will deal with things correctly: - -Transaction-aware signal handlers can read the transactional register state -from the second ucontext. This will be necessary for crash handlers to -determine, for example, the address of the instruction causing the SIGSEGV. - -Example signal handler: - - void crash_handler(int sig, siginfo_t *si, void *uc) - { - ucontext_t *ucp = uc; - ucontext_t *transactional_ucp = ucp->uc_link; - - if (ucp_link) { - u64 msr = ucp->uc_mcontext.regs->msr; - /* May have transactional ucontext! */ -#ifndef __powerpc64__ - msr |= ((u64)transactional_ucp->uc_mcontext.regs->msr) << 32; -#endif - if (MSR_TM_ACTIVE(msr)) { - /* Yes, we crashed during a transaction. Oops. */ - fprintf(stderr, "Transaction to be restarted at 0x%llx, but " - "crashy instruction was at 0x%llx\n", - ucp->uc_mcontext.regs->nip, - transactional_ucp->uc_mcontext.regs->nip); - } - } - - fix_the_problem(ucp->dar); - } - -When in an active transaction that takes a signal, we need to be careful with -the stack. It's possible that the stack has moved back up after the tbegin. -The obvious case here is when the tbegin is called inside a function that -returns before a tend. In this case, the stack is part of the checkpointed -transactional memory state. If we write over this non transactionally or in -suspend, we are in trouble because if we get a tm abort, the program counter and -stack pointer will be back at the tbegin but our in memory stack won't be valid -anymore. - -To avoid this, when taking a signal in an active transaction, we need to use -the stack pointer from the checkpointed state, rather than the speculated -state. This ensures that the signal context (written tm suspended) will be -written below the stack required for the rollback. The transaction is aborted -because of the treclaim, so any memory written between the tbegin and the -signal will be rolled back anyway. - -For signals taken in non-TM or suspended mode, we use the -normal/non-checkpointed stack pointer. - -Any transaction initiated inside a sighandler and suspended on return -from the sighandler to the kernel will get reclaimed and discarded. - -Failure cause codes used by kernel -================================== - -These are defined in , and distinguish different reasons why the -kernel aborted a transaction: - - TM_CAUSE_RESCHED Thread was rescheduled. - TM_CAUSE_TLBI Software TLB invalid. - TM_CAUSE_FAC_UNAV FP/VEC/VSX unavailable trap. - TM_CAUSE_SYSCALL Syscall from active transaction. - TM_CAUSE_SIGNAL Signal delivered. - TM_CAUSE_MISC Currently unused. - TM_CAUSE_ALIGNMENT Alignment fault. - TM_CAUSE_EMULATE Emulation that touched memory. - -These can be checked by the user program's abort handler as TEXASR[0:7]. If -bit 7 is set, it indicates that the error is consider persistent. For example -a TM_CAUSE_ALIGNMENT will be persistent while a TM_CAUSE_RESCHED will not. - -GDB -=== - -GDB and ptrace are not currently TM-aware. If one stops during a transaction, -it looks like the transaction has just started (the checkpointed state is -presented). The transaction cannot then be continued and will take the failure -handler route. Furthermore, the transactional 2nd register state will be -inaccessible. GDB can currently be used on programs using TM, but not sensibly -in parts within transactions. - -POWER9 -====== - -TM on POWER9 has issues with storing the complete register state. This -is described in this commit: - - commit 4bb3c7a0208fc13ca70598efd109901a7cd45ae7 - Author: Paul Mackerras - Date: Wed Mar 21 21:32:01 2018 +1100 - KVM: PPC: Book3S HV: Work around transactional memory bugs in POWER9 - -To account for this different POWER9 chips have TM enabled in -different ways. - -On POWER9N DD2.01 and below, TM is disabled. ie -HWCAP2[PPC_FEATURE2_HTM] is not set. - -On POWER9N DD2.1 TM is configured by firmware to always abort a -transaction when tm suspend occurs. So tsuspend will cause a -transaction to be aborted and rolled back. Kernel exceptions will also -cause the transaction to be aborted and rolled back and the exception -will not occur. If userspace constructs a sigcontext that enables TM -suspend, the sigcontext will be rejected by the kernel. This mode is -advertised to users with HWCAP2[PPC_FEATURE2_HTM_NO_SUSPEND] set. -HWCAP2[PPC_FEATURE2_HTM] is not set in this mode. - -On POWER9N DD2.2 and above, KVM and POWERVM emulate TM for guests (as -described in commit 4bb3c7a0208f), hence TM is enabled for guests -ie. HWCAP2[PPC_FEATURE2_HTM] is set for guest userspace. Guests that -makes heavy use of TM suspend (tsuspend or kernel suspend) will result -in traps into the hypervisor and hence will suffer a performance -degradation. Host userspace has TM disabled -ie. HWCAP2[PPC_FEATURE2_HTM] is not set. (although we make enable it -at some point in the future if we bring the emulation into host -userspace context switching). - -POWER9C DD1.2 and above are only available with POWERVM and hence -Linux only runs as a guest. On these systems TM is emulated like on -POWER9N DD2.2. - -Guest migration from POWER8 to POWER9 will work with POWER9N DD2.2 and -POWER9C DD1.2. Since earlier POWER9 processors don't support TM -emulation, migration from POWER8 to POWER9 is not supported there. diff --git a/MAINTAINERS b/MAINTAINERS index c144bd6a432e..8671909ee75c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4468,7 +4468,7 @@ F: arch/powerpc/platforms/powernv/pci-cxl.c F: drivers/misc/cxl/ F: include/misc/cxl* F: include/uapi/misc/cxl.h -F: Documentation/powerpc/cxl.txt +F: Documentation/powerpc/cxl.rst F: Documentation/ABI/testing/sysfs-class-cxl CXLFLASH (IBM Coherent Accelerator Processor Interface CAPI Flash) SCSI DRIVER @@ -4479,7 +4479,7 @@ L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/cxlflash/ F: include/uapi/scsi/cxlflash_ioctl.h -F: Documentation/powerpc/cxlflash.txt +F: Documentation/powerpc/cxlflash.rst CYBERPRO FB DRIVER M: Russell King @@ -12353,7 +12353,7 @@ F: Documentation/PCI/pci-error-recovery.rst F: drivers/pci/pcie/aer.c F: drivers/pci/pcie/dpc.c F: drivers/pci/pcie/err.c -F: Documentation/powerpc/eeh-pci-error-recovery.txt +F: Documentation/powerpc/eeh-pci-error-recovery.rst F: arch/powerpc/kernel/eeh*.c F: arch/powerpc/platforms/*/eeh*.c F: arch/powerpc/include/*/eeh*.h diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index eee5bef736c8..6ba3cc2ef8ab 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1531,7 +1531,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) * * Call convention: * - * syscall register convention is in Documentation/powerpc/syscall64-abi.txt + * syscall register convention is in Documentation/powerpc/syscall64-abi.rst * * For hypercalls, the register convention is as follows: * r0 volatile diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c index 62c6ba17991a..c9519e62308c 100644 --- a/drivers/soc/fsl/qe/qe.c +++ b/drivers/soc/fsl/qe/qe.c @@ -419,7 +419,7 @@ static void qe_upload_microcode(const void *base, /* * Upload a microcode to the I-RAM at a specific address. * - * See Documentation/powerpc/qe_firmware.txt for information on QE microcode + * See Documentation/powerpc/qe_firmware.rst for information on QE microcode * uploading. * * Currently, only version 1 is supported, so the 'version' field must be diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c index cb4db1b3ca3c..5fb214e67d73 100644 --- a/drivers/tty/hvc/hvcs.c +++ b/drivers/tty/hvc/hvcs.c @@ -47,7 +47,7 @@ * using the 2.6 Linux kernel kref construct. * * For direction on installation and usage of this driver please reference - * Documentation/powerpc/hvcs.txt. + * Documentation/powerpc/hvcs.rst. */ #include diff --git a/include/soc/fsl/qe/qe.h b/include/soc/fsl/qe/qe.h index 3f9d6b6a5691..c1036d16ed03 100644 --- a/include/soc/fsl/qe/qe.h +++ b/include/soc/fsl/qe/qe.h @@ -259,7 +259,7 @@ static inline int qe_alive_during_sleep(void) /* Structure that defines QE firmware binary files. * - * See Documentation/powerpc/qe_firmware.txt for a description of these + * See Documentation/powerpc/qe_firmware.rst for a description of these * fields. */ struct qe_firmware { -- cgit v1.2.3 From a6d81d30d3cd87f85bfd922358eb18b8146c4925 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 16 Jul 2019 16:19:25 -0400 Subject: wait: add wq_has_single_sleeper helper rq-qos sits in the io path so we want to take locks as sparingly as possible. To accomplish this we try not to take the waitqueue head lock unless we are sure we need to go to sleep, and we have an optimization to make sure that we don't starve out existing waiters. Since we check if there are existing waiters locklessly we need to be able to update our view of the waitqueue list after we've added ourselves to the waitqueue. Accomplish this by adding this helper to see if there is more than just ourselves on the list. Reviewed-by: Oleg Nesterov Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- include/linux/wait.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index b6f77cf60dd7..30c515520fb2 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -126,6 +126,19 @@ static inline int waitqueue_active(struct wait_queue_head *wq_head) return !list_empty(&wq_head->head); } +/** + * wq_has_single_sleeper - check if there is only one sleeper + * @wq_head: wait queue head + * + * Returns true of wq_head has only one sleeper on the list. + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head) +{ + return list_is_singular(&wq_head->head); +} + /** * wq_has_sleeper - check if there are any waiting processes * @wq_head: wait queue head -- cgit v1.2.3 From 87dbad02d2254b741c71ce859c451fb1ae6f5340 Mon Sep 17 00:00:00 2001 From: Xiaojie Yuan Date: Mon, 17 Dec 2018 18:00:26 +0800 Subject: drm/amdgpu: add navi14 asic type Add CHIP_NAVI14 to the list of asic types. Signed-off-by: Xiaojie Yuan Reviewed-by: Alex Deucher Reviewed-by: Jack Xiao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + include/drm/amd_asic_type.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5a7f893cf724..ba87964c6515 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -99,6 +99,7 @@ static const char *amdgpu_asic_name[] = { "VEGA20", "RAVEN", "NAVI10", + "NAVI14", "LAST", }; diff --git a/include/drm/amd_asic_type.h b/include/drm/amd_asic_type.h index bcc2bcf32886..0c4766af04af 100644 --- a/include/drm/amd_asic_type.h +++ b/include/drm/amd_asic_type.h @@ -50,6 +50,7 @@ enum amd_asic_type { CHIP_VEGA20, CHIP_RAVEN, CHIP_NAVI10, + CHIP_NAVI14, CHIP_LAST, }; -- cgit v1.2.3 From d6c3b24ea28ddae06c9ff9fe9ae58664144d7ae8 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Tue, 9 Jul 2019 09:16:01 -0500 Subject: drm/amdgpu: add Arcturus asic type Add asic type for Arcturus. Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + include/drm/amd_asic_type.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6aa3c3e5bd50..c1edf105c660 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -99,6 +99,7 @@ static const char *amdgpu_asic_name[] = { "VEGA12", "VEGA20", "RAVEN", + "ARCTURUS", "NAVI10", "NAVI14", "LAST", diff --git a/include/drm/amd_asic_type.h b/include/drm/amd_asic_type.h index 0c4766af04af..0f5a12a99948 100644 --- a/include/drm/amd_asic_type.h +++ b/include/drm/amd_asic_type.h @@ -49,6 +49,7 @@ enum amd_asic_type { CHIP_VEGA12, CHIP_VEGA20, CHIP_RAVEN, + CHIP_ARCTURUS, CHIP_NAVI10, CHIP_NAVI14, CHIP_LAST, -- cgit v1.2.3 From 00289cd87676e14913d2d8492d1ce05c4baafdae Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 17 Jul 2019 18:07:53 -0700 Subject: drivers/base: Introduce kill_device() The libnvdimm subsystem arranges for devices to be destroyed as a result of a sysfs operation. Since device_unregister() cannot be called from an actively running sysfs attribute of the same device libnvdimm arranges for device_unregister() to be performed in an out-of-line async context. The driver core maintains a 'dead' state for coordinating its own racing async registration / de-registration requests. Rather than add local 'dead' state tracking infrastructure to libnvdimm device objects, export the existing state tracking via a new kill_device() helper. The kill_device() helper simply marks the device as dead, i.e. that it is on its way to device_del(), or returns that the device was already dead. This can be used in advance of calling device_unregister() for subsystems like libnvdimm that might need to handle multiple user threads racing to delete a device. This refactoring does not change any behavior, but it is a pre-requisite for follow-on fixes and therefore marked for -stable. Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Fixes: 4d88a97aa9e8 ("libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver...") Cc: Tested-by: Jane Chu Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/156341207332.292348.14959761496009347574.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/base/core.c | 27 +++++++++++++++++++-------- include/linux/device.h | 1 + 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index fd7511e04e62..eaf3aa0cb803 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2211,6 +2211,24 @@ void put_device(struct device *dev) } EXPORT_SYMBOL_GPL(put_device); +bool kill_device(struct device *dev) +{ + /* + * Require the device lock and set the "dead" flag to guarantee that + * the update behavior is consistent with the other bitfields near + * it and that we cannot have an asynchronous probe routine trying + * to run while we are tearing out the bus/class/sysfs from + * underneath the device. + */ + lockdep_assert_held(&dev->mutex); + + if (dev->p->dead) + return false; + dev->p->dead = true; + return true; +} +EXPORT_SYMBOL_GPL(kill_device); + /** * device_del - delete device from system. * @dev: device. @@ -2230,15 +2248,8 @@ void device_del(struct device *dev) struct kobject *glue_dir = NULL; struct class_interface *class_intf; - /* - * Hold the device lock and set the "dead" flag to guarantee that - * the update behavior is consistent with the other bitfields near - * it and that we cannot have an asynchronous probe routine trying - * to run while we are tearing out the bus/class/sysfs from - * underneath the device. - */ device_lock(dev); - dev->p->dead = true; + kill_device(dev); device_unlock(dev); /* Notify clients of device removal. This call must come diff --git a/include/linux/device.h b/include/linux/device.h index e85264fb6616..0da5c67f6be1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1373,6 +1373,7 @@ extern int (*platform_notify_remove)(struct device *dev); */ extern struct device *get_device(struct device *dev); extern void put_device(struct device *dev); +extern bool kill_device(struct device *dev); #ifdef CONFIG_DEVTMPFS extern int devtmpfs_create_node(struct device *dev); -- cgit v1.2.3 From 87a30e1f05d73a34e6d1895065541369131aaf1c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 17 Jul 2019 18:08:26 -0700 Subject: driver-core, libnvdimm: Let device subsystems add local lockdep coverage For good reason, the standard device_lock() is marked lockdep_set_novalidate_class() because there is simply no sane way to describe the myriad ways the device_lock() ordered with other locks. However, that leaves subsystems that know their own local device_lock() ordering rules to find lock ordering mistakes manually. Instead, introduce an optional / additional lockdep-enabled lock that a subsystem can acquire in all the same paths that the device_lock() is acquired. A conversion of the NFIT driver and NVDIMM subsystem to a lockdep-validate device_lock() scheme is included. The debug_nvdimm_lock() implementation implements the correct lock-class and stacking order for the libnvdimm device topology hierarchy. Yes, this is a hack, but hopefully it is a useful hack for other subsystems device_lock() debug sessions. Quoting Greg: "Yeah, it feels a bit hacky but it's really up to a subsystem to mess up using it as much as anything else, so user beware :) I don't object to it if it makes things easier for you to debug." Cc: Ingo Molnar Cc: Ira Weiny Cc: Will Deacon Cc: Dave Jiang Cc: Keith Busch Cc: Peter Zijlstra Cc: Vishal Verma Cc: "Rafael J. Wysocki" Cc: Greg Kroah-Hartman Signed-off-by: Dan Williams Acked-by: Greg Kroah-Hartman Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/156341210661.292348.7014034644265455704.stgit@dwillia2-desk3.amr.corp.intel.com --- drivers/acpi/nfit/core.c | 28 ++++++++--------- drivers/acpi/nfit/nfit.h | 24 +++++++++++++++ drivers/base/core.c | 3 ++ drivers/nvdimm/btt_devs.c | 16 +++++----- drivers/nvdimm/bus.c | 28 ++++++++++------- drivers/nvdimm/core.c | 10 +++--- drivers/nvdimm/dimm_devs.c | 4 +-- drivers/nvdimm/namespace_devs.c | 36 +++++++++++----------- drivers/nvdimm/nd-core.h | 68 +++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/pfn_devs.c | 24 +++++++-------- drivers/nvdimm/pmem.c | 4 +-- drivers/nvdimm/region.c | 2 +- drivers/nvdimm/region_devs.c | 16 +++++----- include/linux/device.h | 5 +++ 14 files changed, 187 insertions(+), 81 deletions(-) (limited to 'include') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 23022cf20d26..f22139458ce1 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1282,7 +1282,7 @@ static ssize_t hw_error_scrub_store(struct device *dev, if (rc) return rc; - device_lock(dev); + nfit_device_lock(dev); nd_desc = dev_get_drvdata(dev); if (nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); @@ -1299,7 +1299,7 @@ static ssize_t hw_error_scrub_store(struct device *dev, break; } } - device_unlock(dev); + nfit_device_unlock(dev); if (rc) return rc; return size; @@ -1319,7 +1319,7 @@ static ssize_t scrub_show(struct device *dev, ssize_t rc = -ENXIO; bool busy; - device_lock(dev); + nfit_device_lock(dev); nd_desc = dev_get_drvdata(dev); if (!nd_desc) { device_unlock(dev); @@ -1339,7 +1339,7 @@ static ssize_t scrub_show(struct device *dev, } mutex_unlock(&acpi_desc->init_mutex); - device_unlock(dev); + nfit_device_unlock(dev); return rc; } @@ -1356,14 +1356,14 @@ static ssize_t scrub_store(struct device *dev, if (val != 1) return -EINVAL; - device_lock(dev); + nfit_device_lock(dev); nd_desc = dev_get_drvdata(dev); if (nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); rc = acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_LONG); } - device_unlock(dev); + nfit_device_unlock(dev); if (rc) return rc; return size; @@ -1749,9 +1749,9 @@ static void acpi_nvdimm_notify(acpi_handle handle, u32 event, void *data) struct acpi_device *adev = data; struct device *dev = &adev->dev; - device_lock(dev->parent); + nfit_device_lock(dev->parent); __acpi_nvdimm_notify(dev, event); - device_unlock(dev->parent); + nfit_device_unlock(dev->parent); } static bool acpi_nvdimm_has_method(struct acpi_device *adev, char *method) @@ -3457,8 +3457,8 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc) struct device *dev = acpi_desc->dev; /* Bounce the device lock to flush acpi_nfit_add / acpi_nfit_notify */ - device_lock(dev); - device_unlock(dev); + nfit_device_lock(dev); + nfit_device_unlock(dev); /* Bounce the init_mutex to complete initial registration */ mutex_lock(&acpi_desc->init_mutex); @@ -3602,8 +3602,8 @@ void acpi_nfit_shutdown(void *data) * acpi_nfit_ars_rescan() submissions have had a chance to * either submit or see ->cancel set. */ - device_lock(bus_dev); - device_unlock(bus_dev); + nfit_device_lock(bus_dev); + nfit_device_unlock(bus_dev); flush_workqueue(nfit_wq); } @@ -3746,9 +3746,9 @@ EXPORT_SYMBOL_GPL(__acpi_nfit_notify); static void acpi_nfit_notify(struct acpi_device *adev, u32 event) { - device_lock(&adev->dev); + nfit_device_lock(&adev->dev); __acpi_nfit_notify(&adev->dev, adev->handle, event); - device_unlock(&adev->dev); + nfit_device_unlock(&adev->dev); } static const struct acpi_device_id acpi_nfit_ids[] = { diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 6ee2b02af73e..24241941181c 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -312,6 +312,30 @@ static inline struct acpi_nfit_desc *to_acpi_desc( return container_of(nd_desc, struct acpi_nfit_desc, nd_desc); } +#ifdef CONFIG_PROVE_LOCKING +static inline void nfit_device_lock(struct device *dev) +{ + device_lock(dev); + mutex_lock(&dev->lockdep_mutex); +} + +static inline void nfit_device_unlock(struct device *dev) +{ + mutex_unlock(&dev->lockdep_mutex); + device_unlock(dev); +} +#else +static inline void nfit_device_lock(struct device *dev) +{ + device_lock(dev); +} + +static inline void nfit_device_unlock(struct device *dev) +{ + device_unlock(dev); +} +#endif + const guid_t *to_nfit_uuid(enum nfit_uuids id); int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz); void acpi_nfit_shutdown(void *data); diff --git a/drivers/base/core.c b/drivers/base/core.c index eaf3aa0cb803..4825949d6547 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1663,6 +1663,9 @@ void device_initialize(struct device *dev) kobject_init(&dev->kobj, &device_ktype); INIT_LIST_HEAD(&dev->dma_pools); mutex_init(&dev->mutex); +#ifdef CONFIG_PROVE_LOCKING + mutex_init(&dev->lockdep_mutex); +#endif lockdep_set_novalidate_class(&dev->mutex); spin_lock_init(&dev->devres_lock); INIT_LIST_HEAD(&dev->devres_head); diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 62d00fffa4af..3508a79110c7 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -62,14 +62,14 @@ static ssize_t sector_size_store(struct device *dev, struct nd_btt *nd_btt = to_nd_btt(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); rc = nd_size_select_store(dev, buf, &nd_btt->lbasize, btt_lbasize_supported); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc ? rc : len; } @@ -91,11 +91,11 @@ static ssize_t uuid_store(struct device *dev, struct nd_btt *nd_btt = to_nd_btt(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); - device_unlock(dev); + nd_device_unlock(dev); return rc ? rc : len; } @@ -120,13 +120,13 @@ static ssize_t namespace_store(struct device *dev, struct nd_btt *nd_btt = to_nd_btt(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc; } @@ -138,14 +138,14 @@ static ssize_t size_show(struct device *dev, struct nd_btt *nd_btt = to_nd_btt(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); if (dev->driver) rc = sprintf(buf, "%llu\n", nd_btt->size); else { /* no size to convey if the btt instance is disabled */ rc = -ENXIO; } - device_unlock(dev); + nd_device_unlock(dev); return rc; } diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index df41f3571dc9..798c5c4aea9c 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -26,7 +26,7 @@ int nvdimm_major; static int nvdimm_bus_major; -static struct class *nd_class; +struct class *nd_class; static DEFINE_IDA(nd_ida); static int to_nd_device_type(struct device *dev) @@ -91,7 +91,10 @@ static int nvdimm_bus_probe(struct device *dev) dev->driver->name, dev_name(dev)); nvdimm_bus_probe_start(nvdimm_bus); + debug_nvdimm_lock(dev); rc = nd_drv->probe(dev); + debug_nvdimm_unlock(dev); + if (rc == 0) nd_region_probe_success(nvdimm_bus, dev); else @@ -113,8 +116,11 @@ static int nvdimm_bus_remove(struct device *dev) struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); int rc = 0; - if (nd_drv->remove) + if (nd_drv->remove) { + debug_nvdimm_lock(dev); rc = nd_drv->remove(dev); + debug_nvdimm_unlock(dev); + } nd_region_disable(nvdimm_bus, dev); dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name, @@ -140,7 +146,7 @@ static void nvdimm_bus_shutdown(struct device *dev) void nd_device_notify(struct device *dev, enum nvdimm_event event) { - device_lock(dev); + nd_device_lock(dev); if (dev->driver) { struct nd_device_driver *nd_drv; @@ -148,7 +154,7 @@ void nd_device_notify(struct device *dev, enum nvdimm_event event) if (nd_drv->notify) nd_drv->notify(dev, event); } - device_unlock(dev); + nd_device_unlock(dev); } EXPORT_SYMBOL(nd_device_notify); @@ -296,7 +302,7 @@ static void nvdimm_bus_release(struct device *dev) kfree(nvdimm_bus); } -static bool is_nvdimm_bus(struct device *dev) +bool is_nvdimm_bus(struct device *dev) { return dev->release == nvdimm_bus_release; } @@ -575,9 +581,9 @@ void nd_device_unregister(struct device *dev, enum nd_async_mode mode) * or otherwise let the async path handle it if the * unregistration was already queued. */ - device_lock(dev); + nd_device_lock(dev); killed = kill_device(dev); - device_unlock(dev); + nd_device_unlock(dev); if (!killed) return; @@ -888,10 +894,10 @@ void wait_nvdimm_bus_probe_idle(struct device *dev) if (nvdimm_bus->probe_active == 0) break; nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); wait_event(nvdimm_bus->wait, nvdimm_bus->probe_active == 0); - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); } while (true); } @@ -1107,7 +1113,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, goto out; } - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf); if (rc) @@ -1129,7 +1135,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, out_unlock: nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); out: kfree(in_env); kfree(out_env); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 5e1f060547bf..9204f1e9fd14 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -246,7 +246,7 @@ static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf, * * Enforce that uuids can only be changed while the device is disabled * (driver detached) - * LOCKING: expects device_lock() is held on entry + * LOCKING: expects nd_device_lock() is held on entry */ int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, size_t len) @@ -347,15 +347,15 @@ static DEVICE_ATTR_RO(provider); static int flush_namespaces(struct device *dev, void *data) { - device_lock(dev); - device_unlock(dev); + nd_device_lock(dev); + nd_device_unlock(dev); return 0; } static int flush_regions_dimms(struct device *dev, void *data) { - device_lock(dev); - device_unlock(dev); + nd_device_lock(dev); + nd_device_unlock(dev); device_for_each_child(dev, NULL, flush_namespaces); return 0; } diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index dfecd6e17043..29a065e769ea 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -484,12 +484,12 @@ static ssize_t security_store(struct device *dev, * done while probing is idle and the DIMM is not in active use * in any region. */ - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); rc = __security_store(dev, buf, len); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc; } diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index a434a5964cb9..92cd809d7e43 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -410,7 +410,7 @@ static ssize_t alt_name_store(struct device *dev, struct nd_region *nd_region = to_nd_region(dev->parent); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); rc = __alt_name_store(dev, buf, len); @@ -418,7 +418,7 @@ static ssize_t alt_name_store(struct device *dev, rc = nd_namespace_label_update(nd_region, dev); dev_dbg(dev, "%s(%zd)\n", rc < 0 ? "fail " : "", rc); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc < 0 ? rc : len; } @@ -1077,7 +1077,7 @@ static ssize_t size_store(struct device *dev, if (rc) return rc; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); rc = __size_store(dev, val); @@ -1103,7 +1103,7 @@ static ssize_t size_store(struct device *dev, dev_dbg(dev, "%llx %s (%d)\n", val, rc < 0 ? "fail" : "success", rc); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc < 0 ? rc : len; } @@ -1286,7 +1286,7 @@ static ssize_t uuid_store(struct device *dev, } else return -ENXIO; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); if (to_ndns(dev)->claim) @@ -1302,7 +1302,7 @@ static ssize_t uuid_store(struct device *dev, dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc < 0 ? rc : len; } @@ -1376,7 +1376,7 @@ static ssize_t sector_size_store(struct device *dev, } else return -ENXIO; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); if (to_ndns(dev)->claim) rc = -EBUSY; @@ -1387,7 +1387,7 @@ static ssize_t sector_size_store(struct device *dev, dev_dbg(dev, "result: %zd %s: %s%s", rc, rc < 0 ? "tried" : "wrote", buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc ? rc : len; } @@ -1502,9 +1502,9 @@ static ssize_t holder_show(struct device *dev, struct nd_namespace_common *ndns = to_ndns(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : ""); - device_unlock(dev); + nd_device_unlock(dev); return rc; } @@ -1541,7 +1541,7 @@ static ssize_t holder_class_store(struct device *dev, struct nd_region *nd_region = to_nd_region(dev->parent); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); rc = __holder_class_store(dev, buf); @@ -1549,7 +1549,7 @@ static ssize_t holder_class_store(struct device *dev, rc = nd_namespace_label_update(nd_region, dev); dev_dbg(dev, "%s(%zd)\n", rc < 0 ? "fail " : "", rc); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc < 0 ? rc : len; } @@ -1560,7 +1560,7 @@ static ssize_t holder_class_show(struct device *dev, struct nd_namespace_common *ndns = to_ndns(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); if (ndns->claim_class == NVDIMM_CCLASS_NONE) rc = sprintf(buf, "\n"); else if ((ndns->claim_class == NVDIMM_CCLASS_BTT) || @@ -1572,7 +1572,7 @@ static ssize_t holder_class_show(struct device *dev, rc = sprintf(buf, "dax\n"); else rc = sprintf(buf, "\n"); - device_unlock(dev); + nd_device_unlock(dev); return rc; } @@ -1586,7 +1586,7 @@ static ssize_t mode_show(struct device *dev, char *mode; ssize_t rc; - device_lock(dev); + nd_device_lock(dev); claim = ndns->claim; if (claim && is_nd_btt(claim)) mode = "safe"; @@ -1599,7 +1599,7 @@ static ssize_t mode_show(struct device *dev, else mode = "raw"; rc = sprintf(buf, "%s\n", mode); - device_unlock(dev); + nd_device_unlock(dev); return rc; } @@ -1703,8 +1703,8 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) * Flush any in-progess probes / removals in the driver * for the raw personality of this namespace. */ - device_lock(&ndns->dev); - device_unlock(&ndns->dev); + nd_device_lock(&ndns->dev); + nd_device_unlock(&ndns->dev); if (ndns->dev.driver) { dev_dbg(&ndns->dev, "is active, can't bind %s\n", dev_name(dev)); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 6cd470547106..0ac52b6eb00e 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -9,6 +9,7 @@ #include #include #include +#include "nd.h" extern struct list_head nvdimm_bus_list; extern struct mutex nvdimm_bus_list_mutex; @@ -182,4 +183,71 @@ ssize_t nd_namespace_store(struct device *dev, struct nd_namespace_common **_ndns, const char *buf, size_t len); struct nd_pfn *to_nd_pfn_safe(struct device *dev); +bool is_nvdimm_bus(struct device *dev); + +#ifdef CONFIG_PROVE_LOCKING +extern struct class *nd_class; + +enum { + LOCK_BUS, + LOCK_NDCTL, + LOCK_REGION, + LOCK_DIMM = LOCK_REGION, + LOCK_NAMESPACE, + LOCK_CLAIM, +}; + +static inline void debug_nvdimm_lock(struct device *dev) +{ + if (is_nd_region(dev)) + mutex_lock_nested(&dev->lockdep_mutex, LOCK_REGION); + else if (is_nvdimm(dev)) + mutex_lock_nested(&dev->lockdep_mutex, LOCK_DIMM); + else if (is_nd_btt(dev) || is_nd_pfn(dev) || is_nd_dax(dev)) + mutex_lock_nested(&dev->lockdep_mutex, LOCK_CLAIM); + else if (dev->parent && (is_nd_region(dev->parent))) + mutex_lock_nested(&dev->lockdep_mutex, LOCK_NAMESPACE); + else if (is_nvdimm_bus(dev)) + mutex_lock_nested(&dev->lockdep_mutex, LOCK_BUS); + else if (dev->class && dev->class == nd_class) + mutex_lock_nested(&dev->lockdep_mutex, LOCK_NDCTL); + else + dev_WARN(dev, "unknown lock level\n"); +} + +static inline void debug_nvdimm_unlock(struct device *dev) +{ + mutex_unlock(&dev->lockdep_mutex); +} + +static inline void nd_device_lock(struct device *dev) +{ + device_lock(dev); + debug_nvdimm_lock(dev); +} + +static inline void nd_device_unlock(struct device *dev) +{ + debug_nvdimm_unlock(dev); + device_unlock(dev); +} +#else +static inline void nd_device_lock(struct device *dev) +{ + device_lock(dev); +} + +static inline void nd_device_unlock(struct device *dev) +{ + device_unlock(dev); +} + +static inline void debug_nvdimm_lock(struct device *dev) +{ +} + +static inline void debug_nvdimm_unlock(struct device *dev) +{ +} +#endif #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 0f81fc56bbfd..9b09fe18e666 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -67,7 +67,7 @@ static ssize_t mode_store(struct device *dev, struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); ssize_t rc = 0; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); if (dev->driver) rc = -EBUSY; @@ -89,7 +89,7 @@ static ssize_t mode_store(struct device *dev, dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc ? rc : len; } @@ -132,14 +132,14 @@ static ssize_t align_store(struct device *dev, struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); rc = nd_size_select_store(dev, buf, &nd_pfn->align, nd_pfn_supported_alignments()); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc ? rc : len; } @@ -161,11 +161,11 @@ static ssize_t uuid_store(struct device *dev, struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); rc = nd_uuid_store(dev, &nd_pfn->uuid, buf, len); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); - device_unlock(dev); + nd_device_unlock(dev); return rc ? rc : len; } @@ -190,13 +190,13 @@ static ssize_t namespace_store(struct device *dev, struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); rc = nd_namespace_store(dev, &nd_pfn->ndns, buf, len); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return rc; } @@ -208,7 +208,7 @@ static ssize_t resource_show(struct device *dev, struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); if (dev->driver) { struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; u64 offset = __le64_to_cpu(pfn_sb->dataoff); @@ -222,7 +222,7 @@ static ssize_t resource_show(struct device *dev, /* no address to convey if the pfn instance is disabled */ rc = -ENXIO; } - device_unlock(dev); + nd_device_unlock(dev); return rc; } @@ -234,7 +234,7 @@ static ssize_t size_show(struct device *dev, struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); if (dev->driver) { struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; u64 offset = __le64_to_cpu(pfn_sb->dataoff); @@ -250,7 +250,7 @@ static ssize_t size_show(struct device *dev, /* no size to convey if the pfn instance is disabled */ rc = -ENXIO; } - device_unlock(dev); + nd_device_unlock(dev); return rc; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 28cb44c61d4a..53797e7be18a 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -520,8 +520,8 @@ static int nd_pmem_remove(struct device *dev) nvdimm_namespace_detach_btt(to_nd_btt(dev)); else { /* - * Note, this assumes device_lock() context to not race - * nd_pmem_notify() + * Note, this assumes nd_device_lock() context to not + * race nd_pmem_notify() */ sysfs_put(pmem->bb_state); pmem->bb_state = NULL; diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 488c47ac4c4a..37bf8719a2a4 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -102,7 +102,7 @@ static int nd_region_remove(struct device *dev) nvdimm_bus_unlock(dev); /* - * Note, this assumes device_lock() context to not race + * Note, this assumes nd_device_lock() context to not race * nd_region_notify() */ sysfs_put(nd_region->bb_state); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index a15276cdec7d..91b5a7ade0d5 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -329,7 +329,7 @@ static ssize_t set_cookie_show(struct device *dev, * the v1.1 namespace label cookie definition. To read all this * data we need to wait for probing to settle. */ - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); if (nd_region->ndr_mappings) { @@ -346,7 +346,7 @@ static ssize_t set_cookie_show(struct device *dev, } } nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); if (rc) return rc; @@ -422,12 +422,12 @@ static ssize_t available_size_show(struct device *dev, * memory nvdimm_bus_lock() is dropped, but that's userspace's * problem to not race itself. */ - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); available = nd_region_available_dpa(nd_region); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return sprintf(buf, "%llu\n", available); } @@ -439,12 +439,12 @@ static ssize_t max_available_extent_show(struct device *dev, struct nd_region *nd_region = to_nd_region(dev); unsigned long long available = 0; - device_lock(dev); + nd_device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); available = nd_region_allocatable_dpa(nd_region); nvdimm_bus_unlock(dev); - device_unlock(dev); + nd_device_unlock(dev); return sprintf(buf, "%llu\n", available); } @@ -563,12 +563,12 @@ static ssize_t region_badblocks_show(struct device *dev, struct nd_region *nd_region = to_nd_region(dev); ssize_t rc; - device_lock(dev); + nd_device_lock(dev); if (dev->driver) rc = badblocks_show(&nd_region->bb, buf, 0); else rc = -ENXIO; - device_unlock(dev); + nd_device_unlock(dev); return rc; } diff --git a/include/linux/device.h b/include/linux/device.h index 0da5c67f6be1..9237b857b598 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -909,6 +909,8 @@ struct dev_links_info { * This identifies the device type and carries type-specific * information. * @mutex: Mutex to synchronize calls to its driver. + * @lockdep_mutex: An optional debug lock that a subsystem can use as a + * peer lock to gain localized lockdep coverage of the device_lock. * @bus: Type of bus device is on. * @driver: Which driver has allocated this * @platform_data: Platform data specific to the device. @@ -991,6 +993,9 @@ struct device { core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ +#ifdef CONFIG_PROVE_LOCKING + struct mutex lockdep_mutex; +#endif struct mutex mutex; /* mutex to synchronize calls to * its driver. */ -- cgit v1.2.3 From 0c7294ddae73ad8d7532f95a86259e311e991a55 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 19 Jul 2019 18:20:14 +0200 Subject: net: flow_offload: remove netns parameter from flow_block_cb_alloc() No need to annotate the netns on the flow block callback object, flow_block_cb_is_busy() already checks for used blocks. Fixes: d63db30c8537 ("net: flow_offload: add flow_block_cb_alloc() and flow_block_cb_free()") Signed-off-by: Pablo Neira Ayuso Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 3 +-- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 5 ++--- drivers/net/ethernet/mscc/ocelot_flower.c | 3 +-- drivers/net/ethernet/mscc/ocelot_tc.c | 2 +- drivers/net/ethernet/netronome/nfp/flower/offload.c | 6 ++---- include/net/flow_offload.h | 3 +-- net/core/flow_offload.c | 9 +++------ net/dsa/slave.c | 2 +- 8 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 7245d287633d..2162412073c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -735,8 +735,7 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev, list_add(&indr_priv->list, &rpriv->uplink_priv.tc_indr_block_priv_list); - block_cb = flow_block_cb_alloc(f->net, - mlx5e_rep_indr_setup_block_cb, + block_cb = flow_block_cb_alloc(mlx5e_rep_indr_setup_block_cb, indr_priv, indr_priv, mlx5e_rep_indr_tc_block_unbind); if (IS_ERR(block_cb)) { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 4d34d42b3b0e..a469035400cf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1610,8 +1610,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, f->net); if (!acl_block) return -ENOMEM; - block_cb = flow_block_cb_alloc(f->net, - mlxsw_sp_setup_tc_block_cb_flower, + block_cb = flow_block_cb_alloc(mlxsw_sp_setup_tc_block_cb_flower, mlxsw_sp, acl_block, mlxsw_sp_tc_block_flower_release); if (IS_ERR(block_cb)) { @@ -1702,7 +1701,7 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, &mlxsw_sp_block_cb_list)) return -EBUSY; - block_cb = flow_block_cb_alloc(f->net, cb, mlxsw_sp_port, + block_cb = flow_block_cb_alloc(cb, mlxsw_sp_port, mlxsw_sp_port, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 7aaddc09c185..6a11aea8b186 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -323,8 +323,7 @@ int ocelot_setup_tc_block_flower_bind(struct ocelot_port *port, if (!port_block) return -ENOMEM; - block_cb = flow_block_cb_alloc(f->net, - ocelot_setup_tc_block_cb_flower, + block_cb = flow_block_cb_alloc(ocelot_setup_tc_block_cb_flower, port, port_block, ocelot_tc_block_unbind); if (IS_ERR(block_cb)) { diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c index 9e6464ffae5d..abbcb66bf5ac 100644 --- a/drivers/net/ethernet/mscc/ocelot_tc.c +++ b/drivers/net/ethernet/mscc/ocelot_tc.c @@ -156,7 +156,7 @@ static int ocelot_setup_tc_block(struct ocelot_port *port, if (flow_block_cb_is_busy(cb, port, &ocelot_block_cb_list)) return -EBUSY; - block_cb = flow_block_cb_alloc(f->net, cb, port, port, NULL); + block_cb = flow_block_cb_alloc(cb, port, port, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index faa8ba012a37..93ab0db6c504 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -1318,8 +1318,7 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev, &nfp_block_cb_list)) return -EBUSY; - block_cb = flow_block_cb_alloc(f->net, - nfp_flower_setup_tc_block_cb, + block_cb = flow_block_cb_alloc(nfp_flower_setup_tc_block_cb, repr, repr, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); @@ -1424,8 +1423,7 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, cb_priv->app = app; list_add(&cb_priv->list, &priv->indr_block_cb_priv); - block_cb = flow_block_cb_alloc(f->net, - nfp_flower_setup_indr_block_cb, + block_cb = flow_block_cb_alloc(nfp_flower_setup_indr_block_cb, cb_priv, cb_priv, nfp_flower_setup_indr_tc_release); if (IS_ERR(block_cb)) { diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index db337299e81e..aa9b5287b231 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -264,7 +264,6 @@ struct flow_block_offload { struct flow_block_cb { struct list_head driver_list; struct list_head list; - struct net *net; tc_setup_cb_t *cb; void *cb_ident; void *cb_priv; @@ -272,7 +271,7 @@ struct flow_block_cb { unsigned int refcnt; }; -struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb, +struct flow_block_cb *flow_block_cb_alloc(tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)); void flow_block_cb_free(struct flow_block_cb *block_cb); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 76f8db3841d7..507de4b48815 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -165,7 +165,7 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_enc_opts); -struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb, +struct flow_block_cb *flow_block_cb_alloc(tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)) { @@ -175,7 +175,6 @@ struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb, if (!block_cb) return ERR_PTR(-ENOMEM); - block_cb->net = net; block_cb->cb = cb; block_cb->cb_ident = cb_ident; block_cb->cb_priv = cb_priv; @@ -200,8 +199,7 @@ struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f, struct flow_block_cb *block_cb; list_for_each_entry(block_cb, f->driver_block_list, driver_list) { - if (block_cb->net == f->net && - block_cb->cb == cb && + if (block_cb->cb == cb && block_cb->cb_ident == cb_ident) return block_cb; } @@ -261,8 +259,7 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list)) return -EBUSY; - block_cb = flow_block_cb_alloc(f->net, cb, cb_ident, - cb_priv, NULL); + block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 614c38ece104..6ca9ec58f881 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -967,7 +967,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, if (flow_block_cb_is_busy(cb, dev, &dsa_slave_block_cb_list)) return -EBUSY; - block_cb = flow_block_cb_alloc(f->net, cb, dev, dev, NULL); + block_cb = flow_block_cb_alloc(cb, dev, dev, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); -- cgit v1.2.3 From a7323311515d488b7714bb7504a1d50fabb0bfcf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 19 Jul 2019 18:20:15 +0200 Subject: net: flow_offload: rename tc_setup_cb_t to flow_setup_cb_t Rename this type definition and adapt users. Signed-off-by: Pablo Neira Ayuso Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +- drivers/net/ethernet/mscc/ocelot_tc.c | 2 +- include/net/flow_offload.h | 17 +++++++++++------ include/net/pkt_cls.h | 5 ++--- include/net/sch_generic.h | 6 ++---- net/core/flow_offload.c | 9 +++++---- net/dsa/slave.c | 2 +- net/sched/cls_api.c | 2 +- net/sched/cls_bpf.c | 2 +- net/sched/cls_flower.c | 2 +- net/sched/cls_matchall.c | 2 +- net/sched/cls_u32.c | 6 +++--- 12 files changed, 30 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index a469035400cf..51cd0b6f1f3e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1679,7 +1679,7 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, struct flow_block_offload *f) { struct flow_block_cb *block_cb; - tc_setup_cb_t *cb; + flow_setup_cb_t *cb; bool ingress; int err; diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c index abbcb66bf5ac..fba9512e9ca6 100644 --- a/drivers/net/ethernet/mscc/ocelot_tc.c +++ b/drivers/net/ethernet/mscc/ocelot_tc.c @@ -134,7 +134,7 @@ static int ocelot_setup_tc_block(struct ocelot_port *port, struct flow_block_offload *f) { struct flow_block_cb *block_cb; - tc_setup_cb_t *cb; + flow_setup_cb_t *cb; int err; netdev_dbg(port->dev, "tc_block command %d, binder_type %d\n", diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index aa9b5287b231..23b299235baf 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -2,8 +2,8 @@ #define _NET_FLOW_OFFLOAD_H #include +#include #include -#include struct flow_match { struct flow_dissector *dissector; @@ -261,23 +261,27 @@ struct flow_block_offload { struct netlink_ext_ack *extack; }; +enum tc_setup_type; +typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, + void *cb_priv); + struct flow_block_cb { struct list_head driver_list; struct list_head list; - tc_setup_cb_t *cb; + flow_setup_cb_t *cb; void *cb_ident; void *cb_priv; void (*release)(void *cb_priv); unsigned int refcnt; }; -struct flow_block_cb *flow_block_cb_alloc(tc_setup_cb_t *cb, +struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)); void flow_block_cb_free(struct flow_block_cb *block_cb); struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *offload, - tc_setup_cb_t *cb, void *cb_ident); + flow_setup_cb_t *cb, void *cb_ident); void *flow_block_cb_priv(struct flow_block_cb *block_cb); void flow_block_cb_incref(struct flow_block_cb *block_cb); @@ -295,11 +299,12 @@ static inline void flow_block_cb_remove(struct flow_block_cb *block_cb, list_move(&block_cb->list, &offload->cb_list); } -bool flow_block_cb_is_busy(tc_setup_cb_t *cb, void *cb_ident, +bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, struct list_head *driver_block_list); int flow_block_cb_setup_simple(struct flow_block_offload *f, - struct list_head *driver_list, tc_setup_cb_t *cb, + struct list_head *driver_list, + flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, bool ingress_only); enum flow_cls_command { diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 841faadceb6e..e429809ca90d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -6,7 +6,6 @@ #include #include #include -#include #include /* TC action not accessible from user space */ @@ -126,14 +125,14 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block) } static inline -int tc_setup_cb_block_register(struct tcf_block *block, tc_setup_cb_t *cb, +int tc_setup_cb_block_register(struct tcf_block *block, flow_setup_cb_t *cb, void *cb_priv) { return 0; } static inline -void tc_setup_cb_block_unregister(struct tcf_block *block, tc_setup_cb_t *cb, +void tc_setup_cb_block_unregister(struct tcf_block *block, flow_setup_cb_t *cb, void *cb_priv) { } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 855167bbc372..9482e060483b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -15,6 +15,7 @@ #include #include #include +#include struct Qdisc_ops; struct qdisc_walker; @@ -22,9 +23,6 @@ struct tcf_walker; struct module; struct bpf_flow_keys; -typedef int tc_setup_cb_t(enum tc_setup_type type, - void *type_data, void *cb_priv); - typedef int tc_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv, enum tc_setup_type type, void *type_data); @@ -313,7 +311,7 @@ struct tcf_proto_ops { void (*walk)(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held); int (*reoffload)(struct tcf_proto *tp, bool add, - tc_setup_cb_t *cb, void *cb_priv, + flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*bind_class)(void *, u32, unsigned long); void * (*tmplt_create)(struct net *net, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 507de4b48815..a800fa78d96c 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -165,7 +165,7 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_enc_opts); -struct flow_block_cb *flow_block_cb_alloc(tc_setup_cb_t *cb, +struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)) { @@ -194,7 +194,7 @@ void flow_block_cb_free(struct flow_block_cb *block_cb) EXPORT_SYMBOL(flow_block_cb_free); struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f, - tc_setup_cb_t *cb, void *cb_ident) + flow_setup_cb_t *cb, void *cb_ident) { struct flow_block_cb *block_cb; @@ -226,7 +226,7 @@ unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb) } EXPORT_SYMBOL(flow_block_cb_decref); -bool flow_block_cb_is_busy(tc_setup_cb_t *cb, void *cb_ident, +bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, struct list_head *driver_block_list) { struct flow_block_cb *block_cb; @@ -243,7 +243,8 @@ EXPORT_SYMBOL(flow_block_cb_is_busy); int flow_block_cb_setup_simple(struct flow_block_offload *f, struct list_head *driver_block_list, - tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, + flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, bool ingress_only) { struct flow_block_cb *block_cb; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6ca9ec58f881..d697a64fb564 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -951,7 +951,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) { struct flow_block_cb *block_cb; - tc_setup_cb_t *cb; + flow_setup_cb_t *cb; if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) cb = dsa_slave_setup_tc_block_cb_ig; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index d144233423c5..78f0f2815b8c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1514,7 +1514,7 @@ void tcf_block_put(struct tcf_block *block) EXPORT_SYMBOL(tcf_block_put); static int -tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, +tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, void *cb_priv, bool add, bool offload_in_use, struct netlink_ext_ack *extack) { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 691f71830134..3f7a9c02b70c 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -651,7 +651,7 @@ skip: } } -static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, +static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 38d6e85693fc..054123742e32 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1800,7 +1800,7 @@ fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add) return NULL; } -static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, +static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index a30d2f8feb32..455ea2793f9b 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -282,7 +282,7 @@ skip: arg->count++; } -static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, +static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index be9e46c77e8b..8614088edd1b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1152,7 +1152,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, } static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, - bool add, tc_setup_cb_t *cb, void *cb_priv, + bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_cls_u32_offload cls_u32 = {}; @@ -1172,7 +1172,7 @@ static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, } static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, - bool add, tc_setup_cb_t *cb, void *cb_priv, + bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); @@ -1213,7 +1213,7 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, return 0; } -static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, +static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; -- cgit v1.2.3 From 14bfb13f0ed525ed117b5d1f3e77e7c0a6be15de Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 19 Jul 2019 18:20:16 +0200 Subject: net: flow_offload: add flow_block structure and use it This object stores the flow block callbacks that are attached to this block. Update flow_block_cb_lookup() to take this new object. This patch restores the block sharing feature. Fixes: da3eeb904ff4 ("net: flow_offload: add list handling functions") Signed-off-by: Pablo Neira Ayuso Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 8 +++++--- drivers/net/ethernet/mscc/ocelot_flower.c | 8 ++++---- drivers/net/ethernet/mscc/ocelot_tc.c | 2 +- drivers/net/ethernet/netronome/nfp/flower/offload.c | 5 +++-- include/net/flow_offload.h | 12 +++++++++++- include/net/netfilter/nf_tables.h | 5 +++-- include/net/sch_generic.h | 2 +- net/core/flow_offload.c | 6 +++--- net/dsa/slave.c | 2 +- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nf_tables_offload.c | 5 +++-- net/sched/cls_api.c | 10 +++++++--- 13 files changed, 44 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 2162412073c5..7f747cb1a4f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -752,7 +752,7 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev, if (!indr_priv) return -ENOENT; - block_cb = flow_block_cb_lookup(f, + block_cb = flow_block_cb_lookup(f->block, mlx5e_rep_indr_setup_block_cb, indr_priv); if (!block_cb) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 51cd0b6f1f3e..650638152bbc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1604,7 +1604,8 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, bool register_block = false; int err; - block_cb = flow_block_cb_lookup(f, mlxsw_sp_setup_tc_block_cb_flower, + block_cb = flow_block_cb_lookup(f->block, + mlxsw_sp_setup_tc_block_cb_flower, mlxsw_sp); if (!block_cb) { acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, f->net); @@ -1656,7 +1657,8 @@ mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, struct flow_block_cb *block_cb; int err; - block_cb = flow_block_cb_lookup(f, mlxsw_sp_setup_tc_block_cb_flower, + block_cb = flow_block_cb_lookup(f->block, + mlxsw_sp_setup_tc_block_cb_flower, mlxsw_sp); if (!block_cb) return; @@ -1717,7 +1719,7 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, case FLOW_BLOCK_UNBIND: mlxsw_sp_setup_tc_block_flower_unbind(mlxsw_sp_port, f, ingress); - block_cb = flow_block_cb_lookup(f, cb, mlxsw_sp_port); + block_cb = flow_block_cb_lookup(f->block, cb, mlxsw_sp_port); if (!block_cb) return -ENOENT; diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 6a11aea8b186..59487d446a09 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -316,8 +316,8 @@ int ocelot_setup_tc_block_flower_bind(struct ocelot_port *port, if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) return -EOPNOTSUPP; - block_cb = flow_block_cb_lookup(f, ocelot_setup_tc_block_cb_flower, - port); + block_cb = flow_block_cb_lookup(f->block, + ocelot_setup_tc_block_cb_flower, port); if (!block_cb) { port_block = ocelot_port_block_create(port); if (!port_block) @@ -350,8 +350,8 @@ void ocelot_setup_tc_block_flower_unbind(struct ocelot_port *port, { struct flow_block_cb *block_cb; - block_cb = flow_block_cb_lookup(f, ocelot_setup_tc_block_cb_flower, - port); + block_cb = flow_block_cb_lookup(f->block, + ocelot_setup_tc_block_cb_flower, port); if (!block_cb) return; diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c index fba9512e9ca6..16a6db71ca5e 100644 --- a/drivers/net/ethernet/mscc/ocelot_tc.c +++ b/drivers/net/ethernet/mscc/ocelot_tc.c @@ -169,7 +169,7 @@ static int ocelot_setup_tc_block(struct ocelot_port *port, list_add_tail(&block_cb->driver_list, f->driver_block_list); return 0; case FLOW_BLOCK_UNBIND: - block_cb = flow_block_cb_lookup(f, cb, port); + block_cb = flow_block_cb_lookup(f->block, cb, port); if (!block_cb) return -ENOENT; diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 93ab0db6c504..e209f150c5f2 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -1327,7 +1327,8 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev, list_add_tail(&block_cb->driver_list, &nfp_block_cb_list); return 0; case FLOW_BLOCK_UNBIND: - block_cb = flow_block_cb_lookup(f, nfp_flower_setup_tc_block_cb, + block_cb = flow_block_cb_lookup(f->block, + nfp_flower_setup_tc_block_cb, repr); if (!block_cb) return -ENOENT; @@ -1440,7 +1441,7 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, if (!cb_priv) return -ENOENT; - block_cb = flow_block_cb_lookup(f, + block_cb = flow_block_cb_lookup(f->block, nfp_flower_setup_indr_block_cb, cb_priv); if (!block_cb) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 23b299235baf..b16d21636d69 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -249,6 +249,10 @@ enum flow_block_binder_type { FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, }; +struct flow_block { + struct list_head cb_list; +}; + struct netlink_ext_ack; struct flow_block_offload { @@ -256,6 +260,7 @@ struct flow_block_offload { enum flow_block_binder_type binder_type; bool block_shared; struct net *net; + struct flow_block *block; struct list_head cb_list; struct list_head *driver_block_list; struct netlink_ext_ack *extack; @@ -280,7 +285,7 @@ struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void (*release)(void *cb_priv)); void flow_block_cb_free(struct flow_block_cb *block_cb); -struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *offload, +struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, flow_setup_cb_t *cb, void *cb_ident); void *flow_block_cb_priv(struct flow_block_cb *block_cb); @@ -337,4 +342,9 @@ flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd) return flow_cmd->rule; } +static inline void flow_block_init(struct flow_block *flow_block) +{ + INIT_LIST_HEAD(&flow_block->cb_list); +} + #endif /* _NET_FLOW_OFFLOAD_H */ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 35dfdd9f69b3..9b624566b82d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -11,6 +11,7 @@ #include #include #include +#include struct module; @@ -951,7 +952,7 @@ struct nft_stats { * @stats: per-cpu chain stats * @chain: the chain * @dev_name: device name that this base chain is attached to (if any) - * @cb_list: list of flow block callbacks (for hardware offload) + * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; @@ -961,7 +962,7 @@ struct nft_base_chain { struct nft_stats __percpu *stats; struct nft_chain chain; char dev_name[IFNAMSIZ]; - struct list_head cb_list; + struct flow_block flow_block; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9482e060483b..6b6b01234dd9 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -399,7 +399,7 @@ struct tcf_block { refcount_t refcnt; struct net *net; struct Qdisc *q; - struct list_head cb_list; + struct flow_block flow_block; struct list_head owner_list; bool keep_dst; unsigned int offloadcnt; /* Number of oddloaded filters */ diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index a800fa78d96c..d63b970784dc 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -193,12 +193,12 @@ void flow_block_cb_free(struct flow_block_cb *block_cb) } EXPORT_SYMBOL(flow_block_cb_free); -struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f, +struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, flow_setup_cb_t *cb, void *cb_ident) { struct flow_block_cb *block_cb; - list_for_each_entry(block_cb, f->driver_block_list, driver_list) { + list_for_each_entry(block_cb, &block->cb_list, list) { if (block_cb->cb == cb && block_cb->cb_ident == cb_ident) return block_cb; @@ -268,7 +268,7 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, list_add_tail(&block_cb->driver_list, driver_block_list); return 0; case FLOW_BLOCK_UNBIND: - block_cb = flow_block_cb_lookup(f, cb, cb_ident); + block_cb = flow_block_cb_lookup(f->block, cb, cb_ident); if (!block_cb) return -ENOENT; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d697a64fb564..33f41178afcc 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -975,7 +975,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, list_add_tail(&block_cb->driver_list, &dsa_slave_block_cb_list); return 0; case FLOW_BLOCK_UNBIND: - block_cb = flow_block_cb_lookup(f, cb, dev); + block_cb = flow_block_cb_lookup(f->block, cb, dev); if (!block_cb) return -ENOENT; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 014e06b0b5cf..605a7cfe7ca7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1662,7 +1662,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, chain->flags |= NFT_BASE_CHAIN | flags; basechain->policy = NF_ACCEPT; - INIT_LIST_HEAD(&basechain->cb_list); + flow_block_init(&basechain->flow_block); } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 2c3302845f67..64f5fd5f240e 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -116,7 +116,7 @@ static int nft_setup_cb_call(struct nft_base_chain *basechain, struct flow_block_cb *block_cb; int err; - list_for_each_entry(block_cb, &basechain->cb_list, list) { + list_for_each_entry(block_cb, &basechain->flow_block.cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; @@ -154,7 +154,7 @@ static int nft_flow_offload_rule(struct nft_trans *trans, static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { - list_splice(&bo->cb_list, &basechain->cb_list); + list_splice(&bo->cb_list, &basechain->flow_block.cb_list); return 0; } @@ -198,6 +198,7 @@ static int nft_flow_offload_chain(struct nft_trans *trans, return -EOPNOTSUPP; bo.command = cmd; + bo.block = &basechain->flow_block; bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo.extack = &extack; INIT_LIST_HEAD(&bo.cb_list); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 78f0f2815b8c..15796fd47fda 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -691,6 +691,8 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev, if (!indr_dev->block) return; + bo.block = &indr_dev->block->flow_block; + indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, &bo); tcf_block_setup(indr_dev->block, &bo); @@ -775,6 +777,7 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, .command = command, .binder_type = ei->binder_type, .net = dev_net(dev), + .block = &block->flow_block, .block_shared = tcf_block_shared(block), .extack = extack, }; @@ -810,6 +813,7 @@ static int tcf_block_offload_cmd(struct tcf_block *block, bo.net = dev_net(dev); bo.command = command; bo.binder_type = ei->binder_type; + bo.block = &block->flow_block; bo.block_shared = tcf_block_shared(block); bo.extack = extack; INIT_LIST_HEAD(&bo.cb_list); @@ -987,8 +991,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, return ERR_PTR(-ENOMEM); } mutex_init(&block->lock); + flow_block_init(&block->flow_block); INIT_LIST_HEAD(&block->chain_list); - INIT_LIST_HEAD(&block->cb_list); INIT_LIST_HEAD(&block->owner_list); INIT_LIST_HEAD(&block->chain0.filter_chain_list); @@ -1570,7 +1574,7 @@ static int tcf_block_bind(struct tcf_block *block, i++; } - list_splice(&bo->cb_list, &block->cb_list); + list_splice(&bo->cb_list, &block->flow_block.cb_list); return 0; @@ -3156,7 +3160,7 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, if (block->nooffloaddevcnt && err_stop) return -EOPNOTSUPP; - list_for_each_entry(block_cb, &block->cb_list, list) { + list_for_each_entry(block_cb, &block->flow_block.cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err) { if (err_stop) -- cgit v1.2.3 From 91046d6364afde646734c7ead1f649d253c386e9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 25 Jun 2019 10:04:51 +0200 Subject: nl80211: fix VENDOR_CMD_RAW_DATA Since ERR_PTR() is an inline, not a macro, just open-code it here so it's usable as an initializer, fixing the build in brcmfmac. Reported-by: Arend Van Spriel Fixes: 901bb9891855 ("nl80211: require and validate vendor command policy") Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 88c27153a4bc..45850a8391d9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4170,7 +4170,7 @@ struct sta_opmode_info { u8 rx_nss; }; -#define VENDOR_CMD_RAW_DATA ((const struct nla_policy *)ERR_PTR(-ENODATA)) +#define VENDOR_CMD_RAW_DATA ((const struct nla_policy *)(long)(-ENODATA)) /** * struct wiphy_vendor_command - vendor command definition -- cgit v1.2.3 From 5edaac063bbf1267260ad2a5b9bb803399343e58 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Thu, 27 Jun 2019 11:58:32 +0200 Subject: nl80211: fix NL80211_HE_MAX_CAPABILITY_LEN NL80211_HE_MAX_CAPABILITY_LEN has changed between D2.0 and D4.0. It is now MAC (6) + PHY (11) + MCS (12) + PPE (25) = 54. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190627095832.19445-1-john@phrozen.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 75758ec26c8b..beb9a9d0c00a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2863,7 +2863,7 @@ enum nl80211_attrs { #define NL80211_HT_CAPABILITY_LEN 26 #define NL80211_VHT_CAPABILITY_LEN 12 #define NL80211_HE_MIN_CAPABILITY_LEN 16 -#define NL80211_HE_MAX_CAPABILITY_LEN 51 +#define NL80211_HE_MAX_CAPABILITY_LEN 54 #define NL80211_MAX_NR_CIPHER_SUITES 5 #define NL80211_MAX_NR_AKM_SUITES 2 -- cgit v1.2.3 From 408d2bbbfd4687c435ee5d4967dbe95bc9be82ed Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Sun, 21 Jul 2019 12:31:05 +0100 Subject: kbuild: add net/netfilter/nf_tables_offload.h to header-test blacklist. net/netfilter/nf_tables_offload.h includes net/netfilter/nf_tables.h which is itself on the blacklist. Reported-by: Jakub Kicinski Signed-off-by: Jeremy Sowden Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/Kbuild b/include/Kbuild index 7e9f1acb9dd5..8de846a83d8f 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -909,6 +909,7 @@ header-test- += net/netfilter/nf_tables.h header-test- += net/netfilter/nf_tables_core.h header-test- += net/netfilter/nf_tables_ipv4.h header-test- += net/netfilter/nf_tables_ipv6.h +header-test- += net/netfilter/nf_tables_offload.h header-test- += net/netfilter/nft_fib.h header-test- += net/netfilter/nft_meta.h header-test- += net/netfilter/nft_reject.h -- cgit v1.2.3 From 903e9d1bffb557220af276eda97b9d6b103ec9e0 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 18 Jul 2019 07:26:46 +0300 Subject: connector: remove redundant input callback from cn_dev A small cleanup: this callback is never used. Originally fixed by Stanislav Kinsburskiy for OpenVZ7 bug OVZ-6877 cc: stanislav.kinsburskiy@gmail.com Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- drivers/connector/connector.c | 6 +----- include/linux/connector.h | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 23553ed6b548..2d22d6bf52f2 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -248,16 +248,12 @@ static int __maybe_unused cn_proc_show(struct seq_file *m, void *v) return 0; } -static struct cn_dev cdev = { - .input = cn_rx_skb, -}; - static int cn_init(void) { struct cn_dev *dev = &cdev; struct netlink_kernel_cfg cfg = { .groups = CN_NETLINK_USERS + 0xf, - .input = dev->input, + .input = cn_rx_skb, }; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, &cfg); diff --git a/include/linux/connector.h b/include/linux/connector.h index 6b6c7396a584..cb732643471b 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -50,7 +50,6 @@ struct cn_dev { u32 seq, groups; struct sock *nls; - void (*input) (struct sk_buff *skb); struct cn_queue_dev *cbdev; }; -- cgit v1.2.3 From c9b07eab0c8760bdd4cf8624c482ee145a322a3b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Jul 2019 13:22:27 -0400 Subject: audit_inode(): switch to passing AUDIT_INODE_... don't bother with remapping LOOKUP_... values - all callers pass constants and we can just as well pass the right ones from the very beginning. Signed-off-by: Al Viro --- fs/namei.c | 6 +++--- include/linux/audit.h | 20 +++++++------------- 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index 5b8c72dc0217..3fca26398bc2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2391,7 +2391,7 @@ static struct filename *filename_parentat(int dfd, struct filename *name, if (likely(!retval)) { *last = nd.last; *type = nd.last_type; - audit_inode(name, parent->dentry, LOOKUP_PARENT); + audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } else { putname(name); name = ERR_PTR(retval); @@ -2718,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path, if (unlikely(error == -ESTALE)) error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); if (likely(!error)) - audit_inode(name, path->dentry, LOOKUP_NO_EVAL); + audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL); restore_nameidata(); putname(name); return error; @@ -3299,7 +3299,7 @@ static int do_last(struct nameidata *nd, if (error) return error; - audit_inode(nd->name, dir, LOOKUP_PARENT); + audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) return -EISDIR; diff --git a/include/linux/audit.h b/include/linux/audit.h index 97d0925454df..543763ab0354 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -252,6 +252,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t) #define audit_is_compat(arch) false #endif +#define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ +#define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ +#define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ + #ifdef CONFIG_AUDITSYSCALL #include /* for syscall_get_arch() */ @@ -265,9 +269,6 @@ extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); -#define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ -#define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ -#define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); @@ -328,16 +329,9 @@ static inline void audit_getname(struct filename *name) } static inline void audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int flags) { - if (unlikely(!audit_dummy_context())) { - unsigned int aflags = 0; - - if (flags & LOOKUP_PARENT) - aflags |= AUDIT_INODE_PARENT; - if (flags & LOOKUP_NO_EVAL) - aflags |= AUDIT_INODE_NOEVAL; + unsigned int aflags) { + if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, aflags); - } } static inline void audit_file(struct file *file) { @@ -561,7 +555,7 @@ static inline void __audit_inode_child(struct inode *parent, { } static inline void audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int aflags) { } static inline void audit_file(struct file *file) { -- cgit v1.2.3 From b617158dc096709d8600c53b6052144d12b89fab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Jul 2019 11:52:33 -0700 Subject: tcp: be more careful in tcp_fragment() Some applications set tiny SO_SNDBUF values and expect TCP to just work. Recent patches to address CVE-2019-11478 broke them in case of losses, since retransmits might be prevented. We should allow these flows to make progress. This patch allows the first and last skb in retransmit queue to be split even if memory limits are hit. It also adds the some room due to the fact that tcp_sendmsg() and tcp_sendpage() might overshoot sk_wmem_queued by about one full TSO skb (64KB size). Note this allowance was already present in stable backports for kernels < 4.15 Note for < 4.15 backports : tcp_rtx_queue_tail() will probably look like : static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { struct sk_buff *skb = tcp_send_head(sk); return skb ? tcp_write_queue_prev(sk, skb) : tcp_write_queue_tail(sk); } Fixes: f070ef2ac667 ("tcp: tcp_fragment() should apply sane memory limits") Signed-off-by: Eric Dumazet Reported-by: Andrew Prout Tested-by: Andrew Prout Tested-by: Jonathan Lemon Tested-by: Michal Kubecek Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Christoph Paasch Cc: Jonathan Looney Signed-off-by: David S. Miller --- include/net/tcp.h | 5 +++++ net/ipv4/tcp_output.c | 13 +++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f42d300f0cfa..e5cf514ba118 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1709,6 +1709,11 @@ static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) return skb_rb_first(&sk->tcp_rtx_queue); } +static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) +{ + return skb_rb_last(&sk->tcp_rtx_queue); +} + static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4af1f5dae9d3..6e4afc48d7bb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1288,6 +1288,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int nsize, old_factor; + long limit; int nlen; u8 flags; @@ -1298,8 +1299,16 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (nsize < 0) nsize = 0; - if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf && - tcp_queue != TCP_FRAG_IN_WRITE_QUEUE)) { + /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. + * We need some allowance to not penalize applications setting small + * SO_SNDBUF values. + * Also allow first and last skb in retransmit queue to be split. + */ + limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE); + if (unlikely((sk->sk_wmem_queued >> 1) > limit && + tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && + skb != tcp_rtx_queue_head(sk) && + skb != tcp_rtx_queue_tail(sk))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); return -ENOMEM; } -- cgit v1.2.3 From 893a1c97205a3ece0cbb3f571a3b972080f3b4c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jul 2019 13:55:23 -0600 Subject: blk-mq: allow REQ_NOWAIT to return an error inline By default, if a caller sets REQ_NOWAIT and we need to block, we'll return -EAGAIN through the bio->bi_end_io() callback. For some use cases, this makes it hard to use. Allow a caller to ask for inline return of errors related to blocking by also setting REQ_NOWAIT_INLINE. Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++-- include/linux/blk_types.h | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index b038ec680e84..2bc2c0705660 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1960,9 +1960,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) + + cookie = BLK_QC_T_NONE; + if (bio->bi_opf & REQ_NOWAIT_INLINE) + cookie = BLK_QC_T_EAGAIN; + else if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - return BLK_QC_T_NONE; + return cookie; } trace_block_getrq(q, bio, bio->bi_opf); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index feff3fe4467e..1b1fa1557e68 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,6 +311,7 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ + __REQ_NOWAIT_INLINE, /* Return would-block error inline */ /* * When a shared kthread needs to issue a bio for a cgroup, doing * so synchronously can lead to priority inversions as the kthread @@ -345,6 +346,7 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) +#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE) #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) @@ -418,12 +420,13 @@ static inline int op_stat_group(unsigned int op) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U +#define BLK_QC_T_EAGAIN -2U #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { - return cookie != BLK_QC_T_NONE; + return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) -- cgit v1.2.3 From 5d4b45a1dd7b00feab57624035dcdbc1bab2e0f8 Mon Sep 17 00:00:00 2001 From: Markus Koch Date: Sun, 21 Jul 2019 20:20:28 +0300 Subject: Input: add support for the FlySky FS-iA6B RC receiver This patch adds support for the FlySky FS-iA6B RC receiver (serial IBUS). It allows the usage of the FlySky FS-i6 and other AFHDS compliant remote controls as a joystick input device. To use it, a patch to inputattach which adds the FS-iA6B as a 115200 baud serial device is required. I will upstream it after this patch is merged. More information about the hardware can be found here: https://notsyncing.net/?p=blog&b=2018.linux-fsia6b Signed-off-by: Markus Koch Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 6 ++ drivers/input/joystick/Kconfig | 10 ++ drivers/input/joystick/Makefile | 5 +- drivers/input/joystick/fsia6b.c | 231 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/serio.h | 1 + 5 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 drivers/input/joystick/fsia6b.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 677ef41cb012..cdd25b0a1218 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12394,6 +12394,12 @@ S: Maintained F: Documentation/input/devices/pxrc.rst F: drivers/input/joystick/pxrc.c +FLYSKY FSIA6B RC RECEIVER +M: Markus Koch +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/input/joystick/fsia6b.c + PHONET PROTOCOL M: Remi Denis-Courmont S: Supported diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig index 72b932901d00..312b854b5506 100644 --- a/drivers/input/joystick/Kconfig +++ b/drivers/input/joystick/Kconfig @@ -362,4 +362,14 @@ config JOYSTICK_PXRC To compile this driver as a module, choose M here: the module will be called pxrc. +config JOYSTICK_FSIA6B + tristate "FlySky FS-iA6B RC Receiver" + select SERIO + help + Say Y here if you use a FlySky FS-i6 RC remote control along with the + FS-iA6B RC receiver as a joystick input device. + + To compile this driver as a module, choose M here: the + module will be called fsia6b. + endif diff --git a/drivers/input/joystick/Makefile b/drivers/input/joystick/Makefile index dd0492ebbed7..8656023f6ef5 100644 --- a/drivers/input/joystick/Makefile +++ b/drivers/input/joystick/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_JOYSTICK_AS5011) += as5011.o obj-$(CONFIG_JOYSTICK_ANALOG) += analog.o obj-$(CONFIG_JOYSTICK_COBRA) += cobra.o obj-$(CONFIG_JOYSTICK_DB9) += db9.o +obj-$(CONFIG_JOYSTICK_FSIA6B) += fsia6b.o obj-$(CONFIG_JOYSTICK_GAMECON) += gamecon.o obj-$(CONFIG_JOYSTICK_GF2K) += gf2k.o obj-$(CONFIG_JOYSTICK_GRIP) += grip.o @@ -23,7 +24,7 @@ obj-$(CONFIG_JOYSTICK_JOYDUMP) += joydump.o obj-$(CONFIG_JOYSTICK_MAGELLAN) += magellan.o obj-$(CONFIG_JOYSTICK_MAPLE) += maplecontrol.o obj-$(CONFIG_JOYSTICK_PSXPAD_SPI) += psxpad-spi.o -obj-$(CONFIG_JOYSTICK_PXRC) += pxrc.o +obj-$(CONFIG_JOYSTICK_PXRC) += pxrc.o obj-$(CONFIG_JOYSTICK_SIDEWINDER) += sidewinder.o obj-$(CONFIG_JOYSTICK_SPACEBALL) += spaceball.o obj-$(CONFIG_JOYSTICK_SPACEORB) += spaceorb.o @@ -32,7 +33,7 @@ obj-$(CONFIG_JOYSTICK_TMDC) += tmdc.o obj-$(CONFIG_JOYSTICK_TURBOGRAFX) += turbografx.o obj-$(CONFIG_JOYSTICK_TWIDJOY) += twidjoy.o obj-$(CONFIG_JOYSTICK_WARRIOR) += warrior.o +obj-$(CONFIG_JOYSTICK_WALKERA0701) += walkera0701.o obj-$(CONFIG_JOYSTICK_XPAD) += xpad.o obj-$(CONFIG_JOYSTICK_ZHENHUA) += zhenhua.o -obj-$(CONFIG_JOYSTICK_WALKERA0701) += walkera0701.o diff --git a/drivers/input/joystick/fsia6b.c b/drivers/input/joystick/fsia6b.c new file mode 100644 index 000000000000..e78c4c768990 --- /dev/null +++ b/drivers/input/joystick/fsia6b.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FS-iA6B iBus RC receiver driver + * + * This driver provides all 14 channels of the FlySky FS-ia6B RC receiver + * as analog values. + * + * Additionally, the channels can be converted to discrete switch values. + * By default, it is configured for the offical FS-i6 remote control. + * If you use a different hardware configuration, you can configure it + * using the `switch_config` parameter. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_DESC "FS-iA6B iBus RC receiver" + +MODULE_AUTHOR("Markus Koch "); +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL"); + +#define IBUS_SERVO_COUNT 14 + +static char *switch_config = "00000022320000"; +module_param(switch_config, charp, 0444); +MODULE_PARM_DESC(switch_config, + "Amount of switch positions per channel (14 characters, 0-3)"); + +static int fsia6b_axes[IBUS_SERVO_COUNT] = { + ABS_X, ABS_Y, + ABS_Z, ABS_RX, + ABS_RY, ABS_RZ, + ABS_HAT0X, ABS_HAT0Y, + ABS_HAT1X, ABS_HAT1Y, + ABS_HAT2X, ABS_HAT2Y, + ABS_HAT3X, ABS_HAT3Y +}; + +enum ibus_state { SYNC, COLLECT, PROCESS }; + +struct ibus_packet { + enum ibus_state state; + + int offset; + u16 ibuf; + u16 channel[IBUS_SERVO_COUNT]; +}; + +struct fsia6b { + struct input_dev *dev; + struct ibus_packet packet; + + char phys[32]; +}; + +static irqreturn_t fsia6b_serio_irq(struct serio *serio, + unsigned char data, unsigned int flags) +{ + struct fsia6b *fsia6b = serio_get_drvdata(serio); + int i; + int sw_state; + int sw_id = BTN_0; + + fsia6b->packet.ibuf = (data << 8) | ((fsia6b->packet.ibuf >> 8) & 0xFF); + + switch (fsia6b->packet.state) { + case SYNC: + if (fsia6b->packet.ibuf == 0x4020) + fsia6b->packet.state = COLLECT; + break; + + case COLLECT: + fsia6b->packet.state = PROCESS; + break; + + case PROCESS: + fsia6b->packet.channel[fsia6b->packet.offset] = + fsia6b->packet.ibuf; + fsia6b->packet.offset++; + + if (fsia6b->packet.offset == IBUS_SERVO_COUNT) { + fsia6b->packet.offset = 0; + fsia6b->packet.state = SYNC; + for (i = 0; i < IBUS_SERVO_COUNT; ++i) { + input_report_abs(fsia6b->dev, fsia6b_axes[i], + fsia6b->packet.channel[i]); + + sw_state = 0; + if (fsia6b->packet.channel[i] > 1900) + sw_state = 1; + else if (fsia6b->packet.channel[i] < 1100) + sw_state = 2; + + switch (switch_config[i]) { + case '3': + input_report_key(fsia6b->dev, + sw_id++, + sw_state == 0); + /* fall-through */ + case '2': + input_report_key(fsia6b->dev, + sw_id++, + sw_state == 1); + /* fall-through */ + case '1': + input_report_key(fsia6b->dev, + sw_id++, + sw_state == 2); + } + } + input_sync(fsia6b->dev); + } else { + fsia6b->packet.state = COLLECT; + } + break; + } + + return IRQ_HANDLED; +} + +static int fsia6b_serio_connect(struct serio *serio, struct serio_driver *drv) +{ + struct fsia6b *fsia6b; + struct input_dev *input_dev; + int err; + int i, j; + int sw_id = 0; + + fsia6b = kzalloc(sizeof(*fsia6b), GFP_KERNEL); + if (!fsia6b) + return -ENOMEM; + + fsia6b->packet.ibuf = 0; + fsia6b->packet.offset = 0; + fsia6b->packet.state = SYNC; + + serio_set_drvdata(serio, fsia6b); + + input_dev = input_allocate_device(); + if (!input_dev) { + err = -ENOMEM; + goto fail1; + } + fsia6b->dev = input_dev; + + snprintf(fsia6b->phys, sizeof(fsia6b->phys), "%s/input0", serio->phys); + + input_dev->name = DRIVER_DESC; + input_dev->phys = fsia6b->phys; + input_dev->id.bustype = BUS_RS232; + input_dev->id.vendor = SERIO_FSIA6B; + input_dev->id.product = serio->id.id; + input_dev->id.version = 0x0100; + input_dev->dev.parent = &serio->dev; + + for (i = 0; i < IBUS_SERVO_COUNT; i++) + input_set_abs_params(input_dev, fsia6b_axes[i], + 1000, 2000, 2, 2); + + /* Register switch configuration */ + for (i = 0; i < IBUS_SERVO_COUNT; i++) { + if (switch_config[i] < '0' || switch_config[i] > '3') { + dev_err(&fsia6b->dev->dev, + "Invalid switch configuration supplied for fsia6b.\n"); + err = -EINVAL; + goto fail2; + } + + for (j = '1'; j <= switch_config[i]; j++) { + input_set_capability(input_dev, EV_KEY, BTN_0 + sw_id); + sw_id++; + } + } + + err = serio_open(serio, drv); + if (err) + goto fail2; + + err = input_register_device(fsia6b->dev); + if (err) + goto fail3; + + return 0; + +fail3: serio_close(serio); +fail2: input_free_device(input_dev); +fail1: serio_set_drvdata(serio, NULL); + kfree(fsia6b); + return err; +} + +static void fsia6b_serio_disconnect(struct serio *serio) +{ + struct fsia6b *fsia6b = serio_get_drvdata(serio); + + serio_close(serio); + serio_set_drvdata(serio, NULL); + input_unregister_device(fsia6b->dev); + kfree(fsia6b); +} + +static const struct serio_device_id fsia6b_serio_ids[] = { + { + .type = SERIO_RS232, + .proto = SERIO_FSIA6B, + .id = SERIO_ANY, + .extra = SERIO_ANY, + }, + { 0 } +}; + +MODULE_DEVICE_TABLE(serio, fsia6b_serio_ids); + +static struct serio_driver fsia6b_serio_drv = { + .driver = { + .name = "fsia6b" + }, + .description = DRIVER_DESC, + .id_table = fsia6b_serio_ids, + .interrupt = fsia6b_serio_irq, + .connect = fsia6b_serio_connect, + .disconnect = fsia6b_serio_disconnect +}; + +module_serio_driver(fsia6b_serio_drv) diff --git a/include/uapi/linux/serio.h b/include/uapi/linux/serio.h index a0cac1d8670d..50e991952c97 100644 --- a/include/uapi/linux/serio.h +++ b/include/uapi/linux/serio.h @@ -82,5 +82,6 @@ #define SERIO_EGALAX 0x3f #define SERIO_PULSE8_CEC 0x40 #define SERIO_RAINSHADOW_CEC 0x41 +#define SERIO_FSIA6B 0x42 #endif /* _UAPI_SERIO_H */ -- cgit v1.2.3 From f9adc23ee91e6f561bb70c6147d8d45bd164d62f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 16 Jul 2019 09:22:03 +0300 Subject: futex: Cleanup generic SMP variant of arch_futex_atomic_op_inuser() The generic SMP variant of arch_futex_atomic_op_inuser() returns always -ENOSYS so the switch case and surrounding code are pointless. Remove it and just return -ENOSYS. Signed-off-by: Vasily Averin Signed-off-by: Thomas Gleixner Acked-by: Arnd Bergmann Link: https://lkml.kernel.org/r/12bdaca8-99eb-e576-f842-5970ab1d6a92@virtuozzo.com --- include/asm-generic/futex.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'include') diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h index 8666fe7f35d7..02970b11f71f 100644 --- a/include/asm-generic/futex.h +++ b/include/asm-generic/futex.h @@ -118,26 +118,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) { - int oldval = 0, ret; - - pagefault_disable(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - pagefault_enable(); - - if (!ret) - *oval = oldval; - - return ret; + return -ENOSYS; } static inline int -- cgit v1.2.3 From f240652b6032b48ad7fa35c5e701cc4c8d697c0b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 5 Jul 2019 10:53:21 -0700 Subject: x86/mpx: Remove MPX APIs MPX is being removed from the kernel due to a lack of support in the toolchain going forward (gcc). The first step is to remove the userspace-visible ABIs so that applications will stop using it. The most visible one are the enable/disable prctl()s. Remove them first. This is the most minimal and least invasive change needed to ensure that apps stop using MPX with new kernels. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190705175321.DB42F0AD@viggo.jf.intel.com --- include/uapi/linux/prctl.h | 2 +- kernel/sys.c | 16 ++-------------- 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 094bb03b9cc2..961e0a4a0f73 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -181,7 +181,7 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 /* - * Tell the kernel to start/stop helping userspace manage bounds tables. + * No longer implemented, but left here to ensure the numbers stay reserved: */ #define PR_MPX_ENABLE_MANAGEMENT 43 #define PR_MPX_DISABLE_MANAGEMENT 44 diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..384b000b7865 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -103,12 +103,6 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif -#ifndef MPX_ENABLE_MANAGEMENT -# define MPX_ENABLE_MANAGEMENT() (-EINVAL) -#endif -#ifndef MPX_DISABLE_MANAGEMENT -# define MPX_DISABLE_MANAGEMENT() (-EINVAL) -#endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) #endif @@ -2456,15 +2450,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = MPX_ENABLE_MANAGEMENT(); - break; case PR_MPX_DISABLE_MANAGEMENT: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = MPX_DISABLE_MANAGEMENT(); - break; + /* No longer implemented: */ + return -EINVAL; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); break; -- cgit v1.2.3 From 6b0e42771795334bd24d089402d04d93bac048d3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 21 Jul 2019 19:01:34 -0400 Subject: locks: revise generic_add_lease tracepoint Now that check_conflicting_open uses inode->i_readcount instead of the dentry->d_count to detect opens for read, revise the tracepoint to display that value instead. Also, fl is never NULL, so no need to check for that in the fast assign section. Cc: Amir Goldstein Reported-by: Al Viro Signed-off-by: Jeff Layton --- include/trace/events/filelock.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index 4b735923f2ff..c705e4944a50 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -176,7 +176,7 @@ TRACE_EVENT(generic_add_lease, TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) - __field(int, dcount) + __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, fl_owner) @@ -188,16 +188,16 @@ TRACE_EVENT(generic_add_lease, __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); - __entry->dcount = d_count(fl->fl_file->f_path.dentry); + __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); - __entry->fl_owner = fl ? fl->fl_owner : NULL; - __entry->fl_flags = fl ? fl->fl_flags : 0; - __entry->fl_type = fl ? fl->fl_type : 0; + __entry->fl_owner = fl->fl_owner; + __entry->fl_flags = fl->fl_flags; + __entry->fl_type = fl->fl_type; ), - TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d dcount=%d icount=%d fl_owner=0x%p fl_flags=%s fl_type=%s", + TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=0x%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), - __entry->i_ino, __entry->wcount, __entry->dcount, + __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type)) -- cgit v1.2.3 From 62ec3d13601bd626ca7a0edef6d45dbb753d94e8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 Jul 2019 23:23:08 +0900 Subject: ASoC: SOF: use __u32 instead of uint32_t in uapi headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_UAPI_HEADER_TEST=y, exported headers are compile-tested to make sure they can be included from user-space. Currently, header.h and fw.h are excluded from the test coverage. To make them join the compile-test, we need to fix the build errors attached below. For a case like this, we decided to use __u{8,16,32,64} variable types in this discussion: https://lkml.org/lkml/2019/6/5/18 Build log: CC usr/include/sound/sof/header.h.s CC usr/include/sound/sof/fw.h.s In file included from :32:0: ./usr/include/sound/sof/header.h:19:2: error: unknown type name ‘uint32_t’ uint32_t magic; /**< 'S', 'O', 'F', '\0' */ ^~~~~~~~ ./usr/include/sound/sof/header.h:20:2: error: unknown type name ‘uint32_t’ uint32_t type; /**< component specific type */ ^~~~~~~~ ./usr/include/sound/sof/header.h:21:2: error: unknown type name ‘uint32_t’ uint32_t size; /**< size in bytes of data excl. this struct */ ^~~~~~~~ ./usr/include/sound/sof/header.h:22:2: error: unknown type name ‘uint32_t’ uint32_t abi; /**< SOF ABI version */ ^~~~~~~~ ./usr/include/sound/sof/header.h:23:2: error: unknown type name ‘uint32_t’ uint32_t reserved[4]; /**< reserved for future use */ ^~~~~~~~ ./usr/include/sound/sof/header.h:24:2: error: unknown type name ‘uint32_t’ uint32_t data[0]; /**< Component data - opaque to core */ ^~~~~~~~ In file included from :32:0: ./usr/include/sound/sof/fw.h:49:2: error: unknown type name ‘uint32_t’ uint32_t size; /* bytes minus this header */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:50:2: error: unknown type name ‘uint32_t’ uint32_t offset; /* offset from base */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:64:2: error: unknown type name ‘uint32_t’ uint32_t size; /* bytes minus this header */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:65:2: error: unknown type name ‘uint32_t’ uint32_t num_blocks; /* number of blocks */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:73:2: error: unknown type name ‘uint32_t’ uint32_t file_size; /* size of file minus this header */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:74:2: error: unknown type name ‘uint32_t’ uint32_t num_modules; /* number of modules */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:75:2: error: unknown type name ‘uint32_t’ uint32_t abi; /* version of header format */ ^~~~~~~~ Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20190721142308.30306-1-yamada.masahiro@socionext.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/fw.h | 16 +++++++++------- include/uapi/sound/sof/header.h | 14 ++++++++------ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/sof/fw.h b/include/uapi/sound/sof/fw.h index 1afca973eb09..e9f697467a86 100644 --- a/include/uapi/sound/sof/fw.h +++ b/include/uapi/sound/sof/fw.h @@ -13,6 +13,8 @@ #ifndef __INCLUDE_UAPI_SOF_FW_H__ #define __INCLUDE_UAPI_SOF_FW_H__ +#include + #define SND_SOF_FW_SIG_SIZE 4 #define SND_SOF_FW_ABI 1 #define SND_SOF_FW_SIG "Reef" @@ -46,8 +48,8 @@ enum snd_sof_fw_blk_type { struct snd_sof_blk_hdr { enum snd_sof_fw_blk_type type; - uint32_t size; /* bytes minus this header */ - uint32_t offset; /* offset from base */ + __u32 size; /* bytes minus this header */ + __u32 offset; /* offset from base */ } __packed; /* @@ -61,8 +63,8 @@ enum snd_sof_fw_mod_type { struct snd_sof_mod_hdr { enum snd_sof_fw_mod_type type; - uint32_t size; /* bytes minus this header */ - uint32_t num_blocks; /* number of blocks */ + __u32 size; /* bytes minus this header */ + __u32 num_blocks; /* number of blocks */ } __packed; /* @@ -70,9 +72,9 @@ struct snd_sof_mod_hdr { */ struct snd_sof_fw_header { unsigned char sig[SND_SOF_FW_SIG_SIZE]; /* "Reef" */ - uint32_t file_size; /* size of file minus this header */ - uint32_t num_modules; /* number of modules */ - uint32_t abi; /* version of header format */ + __u32 file_size; /* size of file minus this header */ + __u32 num_modules; /* number of modules */ + __u32 abi; /* version of header format */ } __packed; #endif diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h index 7868990b0d6f..5f4518e7a972 100644 --- a/include/uapi/sound/sof/header.h +++ b/include/uapi/sound/sof/header.h @@ -9,6 +9,8 @@ #ifndef __INCLUDE_UAPI_SOUND_SOF_USER_HEADER_H__ #define __INCLUDE_UAPI_SOUND_SOF_USER_HEADER_H__ +#include + /* * Header for all non IPC ABI data. * @@ -16,12 +18,12 @@ * Used by any bespoke component data structures or binary blobs. */ struct sof_abi_hdr { - uint32_t magic; /**< 'S', 'O', 'F', '\0' */ - uint32_t type; /**< component specific type */ - uint32_t size; /**< size in bytes of data excl. this struct */ - uint32_t abi; /**< SOF ABI version */ - uint32_t reserved[4]; /**< reserved for future use */ - uint32_t data[0]; /**< Component data - opaque to core */ + __u32 magic; /**< 'S', 'O', 'F', '\0' */ + __u32 type; /**< component specific type */ + __u32 size; /**< size in bytes of data excl. this struct */ + __u32 abi; /**< SOF ABI version */ + __u32 reserved[4]; /**< reserved for future use */ + __u32 data[0]; /**< Component data - opaque to core */ } __packed; #endif -- cgit v1.2.3 From cfc8f568aada98f9608a0a62511ca18d647613e2 Mon Sep 17 00:00:00 2001 From: Oleksandr Suvorov Date: Fri, 19 Jul 2019 10:05:30 +0000 Subject: ASoC: Define a set of DAPM pre/post-up events Prepare to use SND_SOC_DAPM_PRE_POST_PMU definition to reduce coming code size and make it more readable. Cc: stable@vger.kernel.org Signed-off-by: Oleksandr Suvorov Reviewed-by: Marcel Ziswiler Reviewed-by: Igor Opaniuk Reviewed-by: Fabio Estevam Link: https://lore.kernel.org/r/20190719100524.23300-2-oleksandr.suvorov@toradex.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index c00a0b8ade08..6c6694160130 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -353,6 +353,8 @@ struct device; #define SND_SOC_DAPM_WILL_PMD 0x80 /* called at start of sequence */ #define SND_SOC_DAPM_PRE_POST_PMD \ (SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMD) +#define SND_SOC_DAPM_PRE_POST_PMU \ + (SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU) /* convenience event type detection */ #define SND_SOC_DAPM_EVENT_ON(e) \ -- cgit v1.2.3 From 318892ac068397f40ff81d9155898da01493b1d2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Jul 2019 10:29:14 -0700 Subject: net/tls: don't arm strparser immediately in tls_set_sw_offload() In tls_set_device_offload_rx() we prepare the software context for RX fallback and proceed to add the connection to the device. Unfortunately, software context prep includes arming strparser so in case of a later error we have to release the socket lock to call strp_done(). In preparation for not releasing the socket lock half way through callbacks move arming strparser into a separate function. Following patches will make use of that. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- include/net/tls.h | 1 + net/tls/tls_device.c | 1 + net/tls/tls_main.c | 8 +++++--- net/tls/tls_sw.c | 19 ++++++++++++------- 4 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 584609174fe0..43f551cd508b 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -355,6 +355,7 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); +void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 7c0b2b778703..4d67d72f007c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1045,6 +1045,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = tls_set_sw_offload(sk, ctx, 0); if (rc) goto release_ctx; + tls_sw_strparser_arm(sk, ctx); rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 4674e57e66b0..85a9d7d57b32 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -526,6 +526,8 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, { #endif rc = tls_set_sw_offload(sk, ctx, 1); + if (rc) + goto err_crypto_info; conf = TLS_SW; } } else { @@ -537,13 +539,13 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, { #endif rc = tls_set_sw_offload(sk, ctx, 0); + if (rc) + goto err_crypto_info; + tls_sw_strparser_arm(sk, ctx); conf = TLS_SW; } } - if (rc) - goto err_crypto_info; - if (tx) ctx->tx_conf = conf; else diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 53b4ad94e74a..f58a8ffc2a9c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2160,6 +2160,18 @@ void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) } } +void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx) +{ + struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx); + + write_lock_bh(&sk->sk_callback_lock); + rx_ctx->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = tls_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + + strp_check_rcv(&rx_ctx->strp); +} + int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -2357,13 +2369,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) cb.parse_msg = tls_read_size; strp_init(&sw_ctx_rx->strp, sk, &cb); - - write_lock_bh(&sk->sk_callback_lock); - sw_ctx_rx->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = tls_data_ready; - write_unlock_bh(&sk->sk_callback_lock); - - strp_check_rcv(&sw_ctx_rx->strp); } goto out; -- cgit v1.2.3 From f87e62d45e51b12d48d2cb46b5cde8f83b866bc4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Jul 2019 10:29:16 -0700 Subject: net/tls: remove close callback sock unlock/lock around TX work flush The tls close() callback currently drops the sock lock, makes a cancel_delayed_work_sync() call, and then relocks the sock. By restructuring the code we can avoid droping lock and then reclaiming it. To simplify this we do the following, tls_sk_proto_close set_bit(CLOSING) set_bit(SCHEDULE) cancel_delay_work_sync() <- cancel workqueue lock_sock(sk) ... release_sock(sk) strp_done() Setting the CLOSING bit prevents the SCHEDULE bit from being cleared by any workqueue items e.g. if one happens to be scheduled and run between when we set SCHEDULE bit and cancel work. Then because SCHEDULE bit is set now no new work will be scheduled. Tested with net selftests and bpf selftests. Signed-off-by: John Fastabend Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- include/net/tls.h | 2 ++ net/tls/tls_main.c | 3 +++ net/tls/tls_sw.c | 24 +++++++++++++++++------- 3 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 43f551cd508b..d4276cb6de53 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -162,6 +162,7 @@ struct tls_sw_context_tx { int async_capable; #define BIT_TX_SCHEDULED 0 +#define BIT_TX_CLOSING 1 unsigned long tx_bitmask; }; @@ -360,6 +361,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); +void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7ab682ed99fa..5c29b410cf7d 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -268,6 +268,9 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) void (*sk_proto_close)(struct sock *sk, long timeout); bool free_ctx = false; + if (ctx->tx_conf == TLS_SW) + tls_sw_cancel_work_tx(ctx); + lock_sock(sk); sk_proto_close = ctx->sk_proto_close; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f58a8ffc2a9c..38c0e53c727d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2054,6 +2054,15 @@ static void tls_data_ready(struct sock *sk) } } +void tls_sw_cancel_work_tx(struct tls_context *tls_ctx) +{ + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + + set_bit(BIT_TX_CLOSING, &ctx->tx_bitmask); + set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask); + cancel_delayed_work_sync(&ctx->tx_work.work); +} + void tls_sw_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -2065,11 +2074,6 @@ void tls_sw_free_resources_tx(struct sock *sk) if (atomic_read(&ctx->encrypt_pending)) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); - release_sock(sk); - cancel_delayed_work_sync(&ctx->tx_work.work); - lock_sock(sk); - - /* Tx whatever records we can transmit and abandon the rest */ tls_tx_records(sk, -1); /* Free up un-sent records in tx_list. First, free @@ -2137,11 +2141,17 @@ static void tx_work_handler(struct work_struct *work) struct tx_work, work); struct sock *sk = tx_work->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_sw_context_tx *ctx; - if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) + if (unlikely(!tls_ctx)) return; + ctx = tls_sw_ctx_tx(tls_ctx); + if (test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask)) + return; + + if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) + return; lock_sock(sk); tls_tx_records(sk, -1); release_sock(sk); -- cgit v1.2.3 From 313ab004805cf52a42673b15852b3842474ccd87 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Jul 2019 10:29:17 -0700 Subject: net/tls: remove sock unlock/lock around strp_done() The tls close() callback currently drops the sock lock to call strp_done(). Split up the RX cleanup into stopping the strparser and releasing most resources, syncing strparser and finally freeing the context. To avoid the need for a strp_done() call on the cleanup path of device offload make sure we don't arm the strparser until we are sure init will be successful. Signed-off-by: John Fastabend Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- include/net/tls.h | 7 +++--- net/tls/tls_device.c | 1 - net/tls/tls_main.c | 61 ++++++++++++++++++++++++++-------------------------- net/tls/tls_sw.c | 40 +++++++++++++++++++++++++--------- 4 files changed, 64 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index d4276cb6de53..235508e35fd4 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -107,9 +107,7 @@ struct tls_device { enum { TLS_BASE, TLS_SW, -#ifdef CONFIG_TLS_DEVICE TLS_HW, -#endif TLS_HW_RECORD, TLS_NUM_CONFIG, }; @@ -357,14 +355,17 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval, int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); +void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); -void tls_sw_free_resources_tx(struct sock *sk); +void tls_sw_release_resources_tx(struct sock *sk); +void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); +void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); bool tls_sw_stream_read(const struct sock *sk); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 4d67d72f007c..7c0b2b778703 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1045,7 +1045,6 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = tls_set_sw_offload(sk, ctx, 0); if (rc) goto release_ctx; - tls_sw_strparser_arm(sk, ctx); rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 5c29b410cf7d..d152a00a7a27 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -261,24 +261,9 @@ void tls_ctx_free(struct tls_context *ctx) kfree(ctx); } -static void tls_sk_proto_close(struct sock *sk, long timeout) +static void tls_sk_proto_cleanup(struct sock *sk, + struct tls_context *ctx, long timeo) { - struct tls_context *ctx = tls_get_ctx(sk); - long timeo = sock_sndtimeo(sk, 0); - void (*sk_proto_close)(struct sock *sk, long timeout); - bool free_ctx = false; - - if (ctx->tx_conf == TLS_SW) - tls_sw_cancel_work_tx(ctx); - - lock_sock(sk); - sk_proto_close = ctx->sk_proto_close; - - if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) { - free_ctx = true; - goto skip_tx_cleanup; - } - if (unlikely(sk->sk_write_pending) && !wait_on_pending_writer(sk, &timeo)) tls_handle_open_record(sk, 0); @@ -287,7 +272,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) if (ctx->tx_conf == TLS_SW) { kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); - tls_sw_free_resources_tx(sk); + tls_sw_release_resources_tx(sk); #ifdef CONFIG_TLS_DEVICE } else if (ctx->tx_conf == TLS_HW) { tls_device_free_resources_tx(sk); @@ -295,26 +280,40 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } if (ctx->rx_conf == TLS_SW) - tls_sw_free_resources_rx(sk); + tls_sw_release_resources_rx(sk); #ifdef CONFIG_TLS_DEVICE if (ctx->rx_conf == TLS_HW) tls_device_offload_cleanup_rx(sk); - - if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) { -#else - { #endif - tls_ctx_free(ctx); - ctx = NULL; - } +} + +static void tls_sk_proto_close(struct sock *sk, long timeout) +{ + void (*sk_proto_close)(struct sock *sk, long timeout); + struct tls_context *ctx = tls_get_ctx(sk); + long timeo = sock_sndtimeo(sk, 0); + bool free_ctx; + + if (ctx->tx_conf == TLS_SW) + tls_sw_cancel_work_tx(ctx); + + lock_sock(sk); + free_ctx = ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW; + sk_proto_close = ctx->sk_proto_close; + + if (ctx->tx_conf != TLS_BASE || ctx->rx_conf != TLS_BASE) + tls_sk_proto_cleanup(sk, ctx, timeo); -skip_tx_cleanup: release_sock(sk); + if (ctx->tx_conf == TLS_SW) + tls_sw_free_ctx_tx(ctx); + if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) + tls_sw_strparser_done(ctx); + if (ctx->rx_conf == TLS_SW) + tls_sw_free_ctx_rx(ctx); sk_proto_close(sk, timeout); - /* free ctx for TLS_HW_RECORD, used by tcp_set_state - * for sk->sk_prot->unhash [tls_hw_unhash] - */ + if (free_ctx) tls_ctx_free(ctx); } @@ -541,9 +540,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, rc = tls_set_sw_offload(sk, ctx, 0); if (rc) goto err_crypto_info; - tls_sw_strparser_arm(sk, ctx); conf = TLS_SW; } + tls_sw_strparser_arm(sk, ctx); } if (tx) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 38c0e53c727d..91d21b048a9b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2063,7 +2063,7 @@ void tls_sw_cancel_work_tx(struct tls_context *tls_ctx) cancel_delayed_work_sync(&ctx->tx_work.work); } -void tls_sw_free_resources_tx(struct sock *sk) +void tls_sw_release_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); @@ -2096,6 +2096,11 @@ void tls_sw_free_resources_tx(struct sock *sk) crypto_free_aead(ctx->aead_send); tls_free_open_rec(sk); +} + +void tls_sw_free_ctx_tx(struct tls_context *tls_ctx) +{ + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); kfree(ctx); } @@ -2114,25 +2119,40 @@ void tls_sw_release_resources_rx(struct sock *sk) skb_queue_purge(&ctx->rx_list); crypto_free_aead(ctx->aead_recv); strp_stop(&ctx->strp); - write_lock_bh(&sk->sk_callback_lock); - sk->sk_data_ready = ctx->saved_data_ready; - write_unlock_bh(&sk->sk_callback_lock); - release_sock(sk); - strp_done(&ctx->strp); - lock_sock(sk); + /* If tls_sw_strparser_arm() was not called (cleanup paths) + * we still want to strp_stop(), but sk->sk_data_ready was + * never swapped. + */ + if (ctx->saved_data_ready) { + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = ctx->saved_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + } } } -void tls_sw_free_resources_rx(struct sock *sk) +void tls_sw_strparser_done(struct tls_context *tls_ctx) { - struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - tls_sw_release_resources_rx(sk); + strp_done(&ctx->strp); +} + +void tls_sw_free_ctx_rx(struct tls_context *tls_ctx) +{ + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); kfree(ctx); } +void tls_sw_free_resources_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + + tls_sw_release_resources_rx(sk); + tls_sw_free_ctx_rx(tls_ctx); +} + /* The work handler to transmitt the encrypted records in tx_list */ static void tx_work_handler(struct work_struct *work) { -- cgit v1.2.3 From 32857cf57f920cdc03b5095f08febec94cf9c36b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Jul 2019 10:29:18 -0700 Subject: net/tls: fix transition through disconnect with close It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE state via tcp_disconnect() without actually calling tcp_close which would then call the tls close callback. Because of this a user could disconnect a socket then put it in a LISTEN state which would break our assumptions about sockets always being ESTABLISHED state. More directly because close() can call unhash() and unhash is implemented by sockmap if a sockmap socket has TLS enabled we can incorrectly destroy the psock from unhash() and then call its close handler again. But because the psock (sockmap socket representation) is already destroyed we call close handler in sk->prot. However, in some cases (TLS BASE/BASE case) this will still point at the sockmap close handler resulting in a circular call and crash reported by syzbot. To fix both above issues implement the unhash() routine for TLS. v4: - add note about tls offload still needing the fix; - move sk_proto to the cold cache line; - split TX context free into "release" and "free", otherwise the GC work itself is in already freed memory; - more TX before RX for consistency; - reuse tls_ctx_free(); - schedule the GC work after we're done with context to avoid UAF; - don't set the unhash in all modes, all modes "inherit" TLS_BASE's callbacks anyway; - disable the unhash hook for TLS_HW. Fixes: 3c4d7559159bf ("tls: kernel TLS support") Reported-by: Eric Dumazet Signed-off-by: John Fastabend Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- Documentation/networking/tls-offload.rst | 6 ++++ include/net/tls.h | 5 ++- net/tls/tls_main.c | 55 ++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index 048e5ca44824..8a1eeb393316 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -513,3 +513,9 @@ Redirects leak clear text In the RX direction, if segment has already been decrypted by the device and it gets redirected or mirrored - clear text will be transmitted out. + +shutdown() doesn't clear TLS state +---------------------------------- + +shutdown() system call allows for a TLS socket to be reused as a different +connection. Offload doesn't currently handle that. diff --git a/include/net/tls.h b/include/net/tls.h index 235508e35fd4..9e425ac2de45 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -271,6 +271,8 @@ struct tls_context { unsigned long flags; /* cache cold stuff */ + struct proto *sk_proto; + void (*sk_destruct)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); @@ -288,6 +290,8 @@ struct tls_context { struct list_head list; refcount_t refcount; + + struct work_struct gc; }; enum tls_offload_ctx_dir { @@ -359,7 +363,6 @@ void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -void tls_sw_close(struct sock *sk, long timeout); void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); void tls_sw_release_resources_tx(struct sock *sk); void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index d152a00a7a27..48f1c26459d0 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -261,6 +261,33 @@ void tls_ctx_free(struct tls_context *ctx) kfree(ctx); } +static void tls_ctx_free_deferred(struct work_struct *gc) +{ + struct tls_context *ctx = container_of(gc, struct tls_context, gc); + + /* Ensure any remaining work items are completed. The sk will + * already have lost its tls_ctx reference by the time we get + * here so no xmit operation will actually be performed. + */ + if (ctx->tx_conf == TLS_SW) { + tls_sw_cancel_work_tx(ctx); + tls_sw_free_ctx_tx(ctx); + } + + if (ctx->rx_conf == TLS_SW) { + tls_sw_strparser_done(ctx); + tls_sw_free_ctx_rx(ctx); + } + + tls_ctx_free(ctx); +} + +static void tls_ctx_free_wq(struct tls_context *ctx) +{ + INIT_WORK(&ctx->gc, tls_ctx_free_deferred); + schedule_work(&ctx->gc); +} + static void tls_sk_proto_cleanup(struct sock *sk, struct tls_context *ctx, long timeo) { @@ -288,6 +315,26 @@ static void tls_sk_proto_cleanup(struct sock *sk, #endif } +static void tls_sk_proto_unhash(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + long timeo = sock_sndtimeo(sk, 0); + struct tls_context *ctx; + + if (unlikely(!icsk->icsk_ulp_data)) { + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + } + + ctx = tls_get_ctx(sk); + tls_sk_proto_cleanup(sk, ctx, timeo); + icsk->icsk_ulp_data = NULL; + + if (ctx->sk_proto->unhash) + ctx->sk_proto->unhash(sk); + tls_ctx_free_wq(ctx); +} + static void tls_sk_proto_close(struct sock *sk, long timeout) { void (*sk_proto_close)(struct sock *sk, long timeout); @@ -305,6 +352,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) if (ctx->tx_conf != TLS_BASE || ctx->rx_conf != TLS_BASE) tls_sk_proto_cleanup(sk, ctx, timeo); + sk->sk_prot = ctx->sk_proto; release_sock(sk); if (ctx->tx_conf == TLS_SW) tls_sw_free_ctx_tx(ctx); @@ -608,6 +656,7 @@ static struct tls_context *create_ctx(struct sock *sk) ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; ctx->sk_proto_close = sk->sk_prot->close; + ctx->unhash = sk->sk_prot->unhash; return ctx; } @@ -731,6 +780,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt; prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close; + prot[TLS_BASE][TLS_BASE].unhash = tls_sk_proto_unhash; prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg; @@ -748,16 +798,20 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], #ifdef CONFIG_TLS_DEVICE prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; + prot[TLS_HW][TLS_BASE].unhash = base->unhash; prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage; prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; + prot[TLS_HW][TLS_SW].unhash = base->unhash; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; + prot[TLS_BASE][TLS_HW].unhash = base->unhash; prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW]; + prot[TLS_SW][TLS_HW].unhash = base->unhash; prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif @@ -794,6 +848,7 @@ static int tls_init(struct sock *sk) tls_build_proto(sk); ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; + ctx->sk_proto = sk->sk_prot; update_sk_prot(sk, ctx); out: return rc; -- cgit v1.2.3 From 95fa145479fbc0a0c1fd3274ceb42ec03c042a4a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Jul 2019 10:29:22 -0700 Subject: bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 8 +++++++- include/net/tcp.h | 3 +++ net/core/skmsg.c | 4 ++-- net/ipv4/tcp_ulp.c | 13 +++++++++++++ net/tls/tls_main.c | 33 ++++++++++++++++++++++++++++----- 5 files changed, 53 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 50ced8aba9db..e4b3fb4bb77c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -354,7 +354,13 @@ static inline void sk_psock_restore_proto(struct sock *sk, sk->sk_write_space = psock->saved_write_space; if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; + struct inet_connection_sock *icsk = inet_csk(sk); + bool has_ulp = !!icsk->icsk_ulp_data; + + if (has_ulp) + tcp_update_ulp(sk, psock->sk_proto); + else + sk->sk_prot = psock->sk_proto; psock->sk_proto = NULL; } } diff --git a/include/net/tcp.h b/include/net/tcp.h index f42d300f0cfa..c82a23470081 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2103,6 +2103,8 @@ struct tcp_ulp_ops { /* initialize ulp */ int (*init)(struct sock *sk); + /* update ulp */ + void (*update)(struct sock *sk, struct proto *p); /* cleanup ulp */ void (*release)(struct sock *sk); @@ -2114,6 +2116,7 @@ void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); +void tcp_update_ulp(struct sock *sk, struct proto *p); #define MODULE_ALIAS_TCP_ULP(name) \ __MODULE_INFO(alias, alias_userspace, name); \ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 93bffaad2135..6832eeb4b785 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -585,12 +585,12 @@ EXPORT_SYMBOL_GPL(sk_psock_destroy); void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - rcu_assign_sk_user_data(sk, NULL); sk_psock_cork_free(psock); sk_psock_zap_ingress(psock); - sk_psock_restore_proto(sk, psock); write_lock_bh(&sk->sk_callback_lock); + sk_psock_restore_proto(sk, psock); + rcu_assign_sk_user_data(sk, NULL); if (psock->progs.skb_parser) sk_psock_stop_strp(sk, psock); write_unlock_bh(&sk->sk_callback_lock); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 3d8a1d835471..4849edb62d52 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -96,6 +96,19 @@ void tcp_get_available_ulp(char *buf, size_t maxlen) rcu_read_unlock(); } +void tcp_update_ulp(struct sock *sk, struct proto *proto) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!icsk->icsk_ulp_ops) { + sk->sk_prot = proto; + return; + } + + if (icsk->icsk_ulp_ops->update) + icsk->icsk_ulp_ops->update(sk, proto); +} + void tcp_cleanup_ulp(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 48f1c26459d0..f208f8455ef2 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -328,7 +328,10 @@ static void tls_sk_proto_unhash(struct sock *sk) ctx = tls_get_ctx(sk); tls_sk_proto_cleanup(sk, ctx, timeo); + write_lock_bh(&sk->sk_callback_lock); icsk->icsk_ulp_data = NULL; + sk->sk_prot = ctx->sk_proto; + write_unlock_bh(&sk->sk_callback_lock); if (ctx->sk_proto->unhash) ctx->sk_proto->unhash(sk); @@ -337,7 +340,7 @@ static void tls_sk_proto_unhash(struct sock *sk) static void tls_sk_proto_close(struct sock *sk, long timeout) { - void (*sk_proto_close)(struct sock *sk, long timeout); + struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx = tls_get_ctx(sk); long timeo = sock_sndtimeo(sk, 0); bool free_ctx; @@ -347,12 +350,15 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); free_ctx = ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW; - sk_proto_close = ctx->sk_proto_close; if (ctx->tx_conf != TLS_BASE || ctx->rx_conf != TLS_BASE) tls_sk_proto_cleanup(sk, ctx, timeo); + write_lock_bh(&sk->sk_callback_lock); + if (free_ctx) + icsk->icsk_ulp_data = NULL; sk->sk_prot = ctx->sk_proto; + write_unlock_bh(&sk->sk_callback_lock); release_sock(sk); if (ctx->tx_conf == TLS_SW) tls_sw_free_ctx_tx(ctx); @@ -360,7 +366,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) tls_sw_strparser_done(ctx); if (ctx->rx_conf == TLS_SW) tls_sw_free_ctx_rx(ctx); - sk_proto_close(sk, timeout); + ctx->sk_proto_close(sk, timeout); if (free_ctx) tls_ctx_free(ctx); @@ -827,7 +833,7 @@ static int tls_init(struct sock *sk) int rc = 0; if (tls_hw_prot(sk)) - goto out; + return 0; /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. @@ -838,22 +844,38 @@ static int tls_init(struct sock *sk) if (sk->sk_state != TCP_ESTABLISHED) return -ENOTSUPP; + tls_build_proto(sk); + /* allocate tls context */ + write_lock_bh(&sk->sk_callback_lock); ctx = create_ctx(sk); if (!ctx) { rc = -ENOMEM; goto out; } - tls_build_proto(sk); ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; ctx->sk_proto = sk->sk_prot; update_sk_prot(sk, ctx); out: + write_unlock_bh(&sk->sk_callback_lock); return rc; } +static void tls_update(struct sock *sk, struct proto *p) +{ + struct tls_context *ctx; + + ctx = tls_get_ctx(sk); + if (likely(ctx)) { + ctx->sk_proto_close = p->close; + ctx->sk_proto = p; + } else { + sk->sk_prot = p; + } +} + void tls_register_device(struct tls_device *device) { spin_lock_bh(&device_spinlock); @@ -874,6 +896,7 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", .owner = THIS_MODULE, .init = tls_init, + .update = tls_update, }; static int __init tls_register(void) -- cgit v1.2.3 From 9c71b9eb3cb25856e29f0486eae9ee1ba864ba51 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jul 2019 10:16:44 +0200 Subject: dmaengine: omap-dma: make omap_dma_filter_fn private With the audio driver no longer referring to this function, it can be made private to the dmaengine driver itself, and the header file removed. Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/lkml/20190307151646.1016966-1-arnd@arndb.de/ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20190722081705.2084961-1-arnd@arndb.de Signed-off-by: Vinod Koul --- drivers/dma/ti/omap-dma.c | 3 ++- include/linux/omap-dma.h | 2 -- include/linux/omap-dmaengine.h | 18 ------------------ 3 files changed, 2 insertions(+), 21 deletions(-) delete mode 100644 include/linux/omap-dmaengine.h (limited to 'include') diff --git a/drivers/dma/ti/omap-dma.c b/drivers/dma/ti/omap-dma.c index ba2489d4ea24..49da402a1927 100644 --- a/drivers/dma/ti/omap-dma.c +++ b/drivers/dma/ti/omap-dma.c @@ -202,6 +202,7 @@ static const unsigned es_bytes[] = { [CSDP_DATA_TYPE_32] = 4, }; +static bool omap_dma_filter_fn(struct dma_chan *chan, void *param); static struct of_dma_filter_info omap_dma_info = { .filter_fn = omap_dma_filter_fn, }; @@ -1637,7 +1638,7 @@ static struct platform_driver omap_dma_driver = { }, }; -bool omap_dma_filter_fn(struct dma_chan *chan, void *param) +static bool omap_dma_filter_fn(struct dma_chan *chan, void *param) { if (chan->device->dev->driver == &omap_dma_driver.driver) { struct omap_dmadev *od = to_omap_dma_dev(chan->device); diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index 840ce551e773..ba3cfbb52312 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -1,8 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_OMAP_DMA_H #define __LINUX_OMAP_DMA_H -#include - /* * Legacy OMAP DMA handling defines and functions * diff --git a/include/linux/omap-dmaengine.h b/include/linux/omap-dmaengine.h deleted file mode 100644 index b6e42f933c40..000000000000 --- a/include/linux/omap-dmaengine.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * OMAP DMA Engine support - */ -#ifndef __LINUX_OMAP_DMAENGINE_H -#define __LINUX_OMAP_DMAENGINE_H - -struct dma_chan; - -#if defined(CONFIG_DMA_OMAP) || (defined(CONFIG_DMA_OMAP_MODULE) && defined(MODULE)) -bool omap_dma_filter_fn(struct dma_chan *, void *); -#else -static inline bool omap_dma_filter_fn(struct dma_chan *c, void *d) -{ - return false; -} -#endif -#endif /* __LINUX_OMAP_DMAENGINE_H */ -- cgit v1.2.3 From d2bfe7b5d182dad54aedb9dcdb736e8816ead1c3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jul 2019 10:16:45 +0200 Subject: dmaengine: edma: make edma_filter_fn private With the audio driver no longer referring to this function, it can be made private to the dmaengine driver itself, and the header file removed. Acked-by: Peter Ujfalusi Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20190722081705.2084961-2-arnd@arndb.de Signed-off-by: Vinod Koul --- drivers/dma/ti/edma.c | 5 +++-- include/linux/edma.h | 29 ----------------------------- 2 files changed, 3 insertions(+), 31 deletions(-) delete mode 100644 include/linux/edma.h (limited to 'include') diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c index ceabdea40ae0..f2549ee3fb49 100644 --- a/drivers/dma/ti/edma.c +++ b/drivers/dma/ti/edma.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include @@ -2185,6 +2184,8 @@ static struct dma_chan *of_edma_xlate(struct of_phandle_args *dma_spec, } #endif +static bool edma_filter_fn(struct dma_chan *chan, void *param); + static int edma_probe(struct platform_device *pdev) { struct edma_soc_info *info = pdev->dev.platform_data; @@ -2524,7 +2525,7 @@ static struct platform_driver edma_tptc_driver = { }, }; -bool edma_filter_fn(struct dma_chan *chan, void *param) +static bool edma_filter_fn(struct dma_chan *chan, void *param) { bool match = false; diff --git a/include/linux/edma.h b/include/linux/edma.h deleted file mode 100644 index a1307e7827e8..000000000000 --- a/include/linux/edma.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * TI EDMA DMA engine driver - * - * Copyright 2012 Texas Instruments - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. - * - * This program is distributed "as is" WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ -#ifndef __LINUX_EDMA_H -#define __LINUX_EDMA_H - -struct dma_chan; - -#if defined(CONFIG_TI_EDMA) || defined(CONFIG_TI_EDMA_MODULE) -bool edma_filter_fn(struct dma_chan *, void *); -#else -static inline bool edma_filter_fn(struct dma_chan *chan, void *param) -{ - return false; -} -#endif - -#endif -- cgit v1.2.3 From effa467870c7612012885df4e246bdb8ffd8e44c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 16 Jul 2019 22:38:05 +0100 Subject: iommu/vt-d: Don't queue_iova() if there is no flush queue Intel VT-d driver was reworked to use common deferred flushing implementation. Previously there was one global per-cpu flush queue, afterwards - one per domain. Before deferring a flush, the queue should be allocated and initialized. Currently only domains with IOMMU_DOMAIN_DMA type initialize their flush queue. It's probably worth to init it for static or unmanaged domains too, but it may be arguable - I'm leaving it to iommu folks. Prevent queuing an iova flush if the domain doesn't have a queue. The defensive check seems to be worth to keep even if queue would be initialized for all kinds of domains. And is easy backportable. On 4.19.43 stable kernel it has a user-visible effect: previously for devices in si domain there were crashes, on sata devices: BUG: spinlock bad magic on CPU#6, swapper/0/1 lock: 0xffff88844f582008, .magic: 00000000, .owner: /-1, .owner_cpu: 0 CPU: 6 PID: 1 Comm: swapper/0 Not tainted 4.19.43 #1 Call Trace: dump_stack+0x61/0x7e spin_bug+0x9d/0xa3 do_raw_spin_lock+0x22/0x8e _raw_spin_lock_irqsave+0x32/0x3a queue_iova+0x45/0x115 intel_unmap+0x107/0x113 intel_unmap_sg+0x6b/0x76 __ata_qc_complete+0x7f/0x103 ata_qc_complete+0x9b/0x26a ata_qc_complete_multiple+0xd0/0xe3 ahci_handle_port_interrupt+0x3ee/0x48a ahci_handle_port_intr+0x73/0xa9 ahci_single_level_irq_intr+0x40/0x60 __handle_irq_event_percpu+0x7f/0x19a handle_irq_event_percpu+0x32/0x72 handle_irq_event+0x38/0x56 handle_edge_irq+0x102/0x121 handle_irq+0x147/0x15c do_IRQ+0x66/0xf2 common_interrupt+0xf/0xf RIP: 0010:__do_softirq+0x8c/0x2df The same for usb devices that use ehci-pci: BUG: spinlock bad magic on CPU#0, swapper/0/1 lock: 0xffff88844f402008, .magic: 00000000, .owner: /-1, .owner_cpu: 0 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.43 #4 Call Trace: dump_stack+0x61/0x7e spin_bug+0x9d/0xa3 do_raw_spin_lock+0x22/0x8e _raw_spin_lock_irqsave+0x32/0x3a queue_iova+0x77/0x145 intel_unmap+0x107/0x113 intel_unmap_page+0xe/0x10 usb_hcd_unmap_urb_setup_for_dma+0x53/0x9d usb_hcd_unmap_urb_for_dma+0x17/0x100 unmap_urb_for_dma+0x22/0x24 __usb_hcd_giveback_urb+0x51/0xc3 usb_giveback_urb_bh+0x97/0xde tasklet_action_common.isra.4+0x5f/0xa1 tasklet_action+0x2d/0x30 __do_softirq+0x138/0x2df irq_exit+0x7d/0x8b smp_apic_timer_interrupt+0x10f/0x151 apic_timer_interrupt+0xf/0x20 RIP: 0010:_raw_spin_unlock_irqrestore+0x17/0x39 Cc: David Woodhouse Cc: Joerg Roedel Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: # 4.14+ Fixes: 13cf01744608 ("iommu/vt-d: Make use of iova deferred flushing") Signed-off-by: Dmitry Safonov Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 3 ++- drivers/iommu/iova.c | 18 ++++++++++++++---- include/linux/iova.h | 6 ++++++ 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 9b1d62d03370..72c6d647bec9 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3561,7 +3561,8 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) freelist = domain_unmap(domain, start_pfn, last_pfn); - if (intel_iommu_strict || (pdev && pdev->untrusted)) { + if (intel_iommu_strict || (pdev && pdev->untrusted) || + !has_iova_flush_queue(&domain->iovad)) { iommu_flush_iotlb_psi(iommu, domain, start_pfn, nrpages, !freelist, 0); /* free iova */ diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index d499b2621239..8413ae54904a 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -54,9 +54,14 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, } EXPORT_SYMBOL_GPL(init_iova_domain); +bool has_iova_flush_queue(struct iova_domain *iovad) +{ + return !!iovad->fq; +} + static void free_iova_flush_queue(struct iova_domain *iovad) { - if (!iovad->fq) + if (!has_iova_flush_queue(iovad)) return; if (timer_pending(&iovad->fq_timer)) @@ -74,13 +79,14 @@ static void free_iova_flush_queue(struct iova_domain *iovad) int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor) { + struct iova_fq __percpu *queue; int cpu; atomic64_set(&iovad->fq_flush_start_cnt, 0); atomic64_set(&iovad->fq_flush_finish_cnt, 0); - iovad->fq = alloc_percpu(struct iova_fq); - if (!iovad->fq) + queue = alloc_percpu(struct iova_fq); + if (!queue) return -ENOMEM; iovad->flush_cb = flush_cb; @@ -89,13 +95,17 @@ int init_iova_flush_queue(struct iova_domain *iovad, for_each_possible_cpu(cpu) { struct iova_fq *fq; - fq = per_cpu_ptr(iovad->fq, cpu); + fq = per_cpu_ptr(queue, cpu); fq->head = 0; fq->tail = 0; spin_lock_init(&fq->lock); } + smp_wmb(); + + iovad->fq = queue; + timer_setup(&iovad->fq_timer, fq_flush_timeout, 0); atomic_set(&iovad->fq_timer_on, 0); diff --git a/include/linux/iova.h b/include/linux/iova.h index 781b96ac706f..cd0f1de901a8 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -155,6 +155,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn); +bool has_iova_flush_queue(struct iova_domain *iovad); int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); @@ -235,6 +236,11 @@ static inline void init_iova_domain(struct iova_domain *iovad, { } +bool has_iova_flush_queue(struct iova_domain *iovad) +{ + return false; +} + static inline int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor) -- cgit v1.2.3 From ae24fb49d01103c80d6ff3b78714259c1c62c958 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 22 Jul 2019 15:40:07 +0100 Subject: iommu/virtio: Update to most recent specification Following specification review a few things were changed in v8 of the virtio-iommu series [1], but have been omitted when merging the base driver. Add them now: * Remove the EXEC flag. * Add feature bit for the MMIO flag. * Change domain_bits to domain_range. * Add NOMEM status flag. [1] https://lore.kernel.org/linux-iommu/20190530170929.19366-1-jean-philippe.brucker@arm.com/ Fixes: edcd69ab9a32 ("iommu: Add virtio-iommu driver") Reported-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Signed-off-by: Michael S. Tsirkin Reviewed-by: Eric Auger Tested-by: Eric Auger Acked-by: Joerg Roedel --- drivers/iommu/virtio-iommu.c | 40 ++++++++++++++++++++++++++++----------- include/uapi/linux/virtio_iommu.h | 32 +++++++++++++++++-------------- 2 files changed, 47 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 433f4d2ee956..80a740df0737 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -2,7 +2,7 @@ /* * Virtio driver for the paravirtualized IOMMU * - * Copyright (C) 2018 Arm Limited + * Copyright (C) 2019 Arm Limited */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -47,7 +47,10 @@ struct viommu_dev { /* Device configuration */ struct iommu_domain_geometry geometry; u64 pgsize_bitmap; - u8 domain_bits; + u32 first_domain; + u32 last_domain; + /* Supported MAP flags */ + u32 map_flags; u32 probe_size; }; @@ -62,6 +65,7 @@ struct viommu_domain { struct viommu_dev *viommu; struct mutex mutex; /* protects viommu pointer */ unsigned int id; + u32 map_flags; spinlock_t mappings_lock; struct rb_root_cached mappings; @@ -113,6 +117,8 @@ static int viommu_get_req_errno(void *buf, size_t len) return -ENOENT; case VIRTIO_IOMMU_S_FAULT: return -EFAULT; + case VIRTIO_IOMMU_S_NOMEM: + return -ENOMEM; case VIRTIO_IOMMU_S_IOERR: case VIRTIO_IOMMU_S_DEVERR: default: @@ -607,15 +613,15 @@ static int viommu_domain_finalise(struct viommu_dev *viommu, { int ret; struct viommu_domain *vdomain = to_viommu_domain(domain); - unsigned int max_domain = viommu->domain_bits > 31 ? ~0 : - (1U << viommu->domain_bits) - 1; vdomain->viommu = viommu; + vdomain->map_flags = viommu->map_flags; domain->pgsize_bitmap = viommu->pgsize_bitmap; domain->geometry = viommu->geometry; - ret = ida_alloc_max(&viommu->domain_ids, max_domain, GFP_KERNEL); + ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain, + viommu->last_domain, GFP_KERNEL); if (ret >= 0) vdomain->id = (unsigned int)ret; @@ -710,7 +716,7 @@ static int viommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { int ret; - int flags; + u32 flags; struct virtio_iommu_req_map map; struct viommu_domain *vdomain = to_viommu_domain(domain); @@ -718,6 +724,9 @@ static int viommu_map(struct iommu_domain *domain, unsigned long iova, (prot & IOMMU_WRITE ? VIRTIO_IOMMU_MAP_F_WRITE : 0) | (prot & IOMMU_MMIO ? VIRTIO_IOMMU_MAP_F_MMIO : 0); + if (flags & ~vdomain->map_flags) + return -EINVAL; + ret = viommu_add_mapping(vdomain, iova, paddr, size, flags); if (ret) return ret; @@ -1027,7 +1036,8 @@ static int viommu_probe(struct virtio_device *vdev) goto err_free_vqs; } - viommu->domain_bits = 32; + viommu->map_flags = VIRTIO_IOMMU_MAP_F_READ | VIRTIO_IOMMU_MAP_F_WRITE; + viommu->last_domain = ~0U; /* Optional features */ virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, @@ -1038,9 +1048,13 @@ static int viommu_probe(struct virtio_device *vdev) struct virtio_iommu_config, input_range.end, &input_end); - virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_BITS, - struct virtio_iommu_config, domain_bits, - &viommu->domain_bits); + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_RANGE, + struct virtio_iommu_config, domain_range.start, + &viommu->first_domain); + + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_RANGE, + struct virtio_iommu_config, domain_range.end, + &viommu->last_domain); virtio_cread_feature(vdev, VIRTIO_IOMMU_F_PROBE, struct virtio_iommu_config, probe_size, @@ -1052,6 +1066,9 @@ static int viommu_probe(struct virtio_device *vdev) .force_aperture = true, }; + if (virtio_has_feature(vdev, VIRTIO_IOMMU_F_MMIO)) + viommu->map_flags |= VIRTIO_IOMMU_MAP_F_MMIO; + viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap; virtio_device_ready(vdev); @@ -1130,9 +1147,10 @@ static void viommu_config_changed(struct virtio_device *vdev) static unsigned int features[] = { VIRTIO_IOMMU_F_MAP_UNMAP, - VIRTIO_IOMMU_F_DOMAIN_BITS, VIRTIO_IOMMU_F_INPUT_RANGE, + VIRTIO_IOMMU_F_DOMAIN_RANGE, VIRTIO_IOMMU_F_PROBE, + VIRTIO_IOMMU_F_MMIO, }; static struct virtio_device_id id_table[] = { diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h index ba1b460c9944..237e36a280cb 100644 --- a/include/uapi/linux/virtio_iommu.h +++ b/include/uapi/linux/virtio_iommu.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /* - * Virtio-iommu definition v0.9 + * Virtio-iommu definition v0.12 * - * Copyright (C) 2018 Arm Ltd. + * Copyright (C) 2019 Arm Ltd. */ #ifndef _UAPI_LINUX_VIRTIO_IOMMU_H #define _UAPI_LINUX_VIRTIO_IOMMU_H @@ -11,26 +11,31 @@ /* Feature bits */ #define VIRTIO_IOMMU_F_INPUT_RANGE 0 -#define VIRTIO_IOMMU_F_DOMAIN_BITS 1 +#define VIRTIO_IOMMU_F_DOMAIN_RANGE 1 #define VIRTIO_IOMMU_F_MAP_UNMAP 2 #define VIRTIO_IOMMU_F_BYPASS 3 #define VIRTIO_IOMMU_F_PROBE 4 +#define VIRTIO_IOMMU_F_MMIO 5 -struct virtio_iommu_range { - __u64 start; - __u64 end; +struct virtio_iommu_range_64 { + __le64 start; + __le64 end; +}; + +struct virtio_iommu_range_32 { + __le32 start; + __le32 end; }; struct virtio_iommu_config { /* Supported page sizes */ - __u64 page_size_mask; + __le64 page_size_mask; /* Supported IOVA range */ - struct virtio_iommu_range input_range; + struct virtio_iommu_range_64 input_range; /* Max domain ID size */ - __u8 domain_bits; - __u8 padding[3]; + struct virtio_iommu_range_32 domain_range; /* Probe buffer size */ - __u32 probe_size; + __le32 probe_size; }; /* Request types */ @@ -49,6 +54,7 @@ struct virtio_iommu_config { #define VIRTIO_IOMMU_S_RANGE 0x05 #define VIRTIO_IOMMU_S_NOENT 0x06 #define VIRTIO_IOMMU_S_FAULT 0x07 +#define VIRTIO_IOMMU_S_NOMEM 0x08 struct virtio_iommu_req_head { __u8 type; @@ -78,12 +84,10 @@ struct virtio_iommu_req_detach { #define VIRTIO_IOMMU_MAP_F_READ (1 << 0) #define VIRTIO_IOMMU_MAP_F_WRITE (1 << 1) -#define VIRTIO_IOMMU_MAP_F_EXEC (1 << 2) -#define VIRTIO_IOMMU_MAP_F_MMIO (1 << 3) +#define VIRTIO_IOMMU_MAP_F_MMIO (1 << 2) #define VIRTIO_IOMMU_MAP_F_MASK (VIRTIO_IOMMU_MAP_F_READ | \ VIRTIO_IOMMU_MAP_F_WRITE | \ - VIRTIO_IOMMU_MAP_F_EXEC | \ VIRTIO_IOMMU_MAP_F_MMIO) struct virtio_iommu_req_map { -- cgit v1.2.3 From 60c1b220d8bc6baeaf837cd60f94a331b25c26bc Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 27 Jun 2019 12:52:58 -0700 Subject: cpu-topology: Move cpu topology code to common code. Both RISC-V & ARM64 are using cpu-map device tree to describe their cpu topology. It's better to move the relevant code to a common place instead of duplicate code. To: Will Deacon To: Catalin Marinas Signed-off-by: Atish Patra [Tested on QDF2400] Tested-by: Jeffrey Hugo [Tested on Juno and other embedded platforms.] Tested-by: Sudeep Holla Reviewed-by: Sudeep Holla Acked-by: Will Deacon Acked-by: Greg Kroah-Hartman Signed-off-by: Paul Walmsley --- arch/arm64/include/asm/topology.h | 23 --- arch/arm64/kernel/topology.c | 303 +------------------------------------- drivers/base/arch_topology.c | 296 +++++++++++++++++++++++++++++++++++++ include/linux/arch_topology.h | 28 ++++ include/linux/topology.h | 1 + 5 files changed, 329 insertions(+), 322 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0524f2438649..a4d945db95a2 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -4,29 +4,6 @@ #include -struct cpu_topology { - int thread_id; - int core_id; - int package_id; - int llc_id; - cpumask_t thread_sibling; - cpumask_t core_sibling; - cpumask_t llc_sibling; -}; - -extern struct cpu_topology cpu_topology[NR_CPUS]; - -#define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) -#define topology_core_id(cpu) (cpu_topology[cpu].core_id) -#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) -#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) -#define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) - -void init_cpu_topology(void); -void store_cpu_topology(unsigned int cpuid); -void remove_cpu_topology(unsigned int cpuid); -const struct cpumask *cpu_coregroup_mask(int cpu); - #ifdef CONFIG_NUMA struct pci_bus; diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 0825c4a856e3..6b95c91e7d67 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -14,250 +14,13 @@ #include #include #include -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include -static int __init get_cpu_for_node(struct device_node *node) -{ - struct device_node *cpu_node; - int cpu; - - cpu_node = of_parse_phandle(node, "cpu", 0); - if (!cpu_node) - return -1; - - cpu = of_cpu_node_to_id(cpu_node); - if (cpu >= 0) - topology_parse_cpu_capacity(cpu_node, cpu); - else - pr_crit("Unable to find CPU node for %pOF\n", cpu_node); - - of_node_put(cpu_node); - return cpu; -} - -static int __init parse_core(struct device_node *core, int package_id, - int core_id) -{ - char name[10]; - bool leaf = true; - int i = 0; - int cpu; - struct device_node *t; - - do { - snprintf(name, sizeof(name), "thread%d", i); - t = of_get_child_by_name(core, name); - if (t) { - leaf = false; - cpu = get_cpu_for_node(t); - if (cpu >= 0) { - cpu_topology[cpu].package_id = package_id; - cpu_topology[cpu].core_id = core_id; - cpu_topology[cpu].thread_id = i; - } else { - pr_err("%pOF: Can't get CPU for thread\n", - t); - of_node_put(t); - return -EINVAL; - } - of_node_put(t); - } - i++; - } while (t); - - cpu = get_cpu_for_node(core); - if (cpu >= 0) { - if (!leaf) { - pr_err("%pOF: Core has both threads and CPU\n", - core); - return -EINVAL; - } - - cpu_topology[cpu].package_id = package_id; - cpu_topology[cpu].core_id = core_id; - } else if (leaf) { - pr_err("%pOF: Can't get CPU for leaf core\n", core); - return -EINVAL; - } - - return 0; -} - -static int __init parse_cluster(struct device_node *cluster, int depth) -{ - char name[10]; - bool leaf = true; - bool has_cores = false; - struct device_node *c; - static int package_id __initdata; - int core_id = 0; - int i, ret; - - /* - * First check for child clusters; we currently ignore any - * information about the nesting of clusters and present the - * scheduler with a flat list of them. - */ - i = 0; - do { - snprintf(name, sizeof(name), "cluster%d", i); - c = of_get_child_by_name(cluster, name); - if (c) { - leaf = false; - ret = parse_cluster(c, depth + 1); - of_node_put(c); - if (ret != 0) - return ret; - } - i++; - } while (c); - - /* Now check for cores */ - i = 0; - do { - snprintf(name, sizeof(name), "core%d", i); - c = of_get_child_by_name(cluster, name); - if (c) { - has_cores = true; - - if (depth == 0) { - pr_err("%pOF: cpu-map children should be clusters\n", - c); - of_node_put(c); - return -EINVAL; - } - - if (leaf) { - ret = parse_core(c, package_id, core_id++); - } else { - pr_err("%pOF: Non-leaf cluster with core %s\n", - cluster, name); - ret = -EINVAL; - } - - of_node_put(c); - if (ret != 0) - return ret; - } - i++; - } while (c); - - if (leaf && !has_cores) - pr_warn("%pOF: empty cluster\n", cluster); - - if (leaf) - package_id++; - - return 0; -} - -static int __init parse_dt_topology(void) -{ - struct device_node *cn, *map; - int ret = 0; - int cpu; - - cn = of_find_node_by_path("/cpus"); - if (!cn) { - pr_err("No CPU information found in DT\n"); - return 0; - } - - /* - * When topology is provided cpu-map is essentially a root - * cluster with restricted subnodes. - */ - map = of_get_child_by_name(cn, "cpu-map"); - if (!map) - goto out; - - ret = parse_cluster(map, 0); - if (ret != 0) - goto out_map; - - topology_normalize_cpu_scale(); - - /* - * Check that all cores are in the topology; the SMP code will - * only mark cores described in the DT as possible. - */ - for_each_possible_cpu(cpu) - if (cpu_topology[cpu].package_id == -1) - ret = -EINVAL; - -out_map: - of_node_put(map); -out: - of_node_put(cn); - return ret; -} - -/* - * cpu topology table - */ -struct cpu_topology cpu_topology[NR_CPUS]; -EXPORT_SYMBOL_GPL(cpu_topology); - -const struct cpumask *cpu_coregroup_mask(int cpu) -{ - const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu)); - - /* Find the smaller of NUMA, core or LLC siblings */ - if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) { - /* not numa in package, lets use the package siblings */ - core_mask = &cpu_topology[cpu].core_sibling; - } - if (cpu_topology[cpu].llc_id != -1) { - if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask)) - core_mask = &cpu_topology[cpu].llc_sibling; - } - - return core_mask; -} - -static void update_siblings_masks(unsigned int cpuid) -{ - struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; - int cpu; - - /* update core and thread sibling masks */ - for_each_online_cpu(cpu) { - cpu_topo = &cpu_topology[cpu]; - - if (cpuid_topo->llc_id == cpu_topo->llc_id) { - cpumask_set_cpu(cpu, &cpuid_topo->llc_sibling); - cpumask_set_cpu(cpuid, &cpu_topo->llc_sibling); - } - - if (cpuid_topo->package_id != cpu_topo->package_id) - continue; - - cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); - cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); - - if (cpuid_topo->core_id != cpu_topo->core_id) - continue; - - cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); - cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); - } -} - void store_cpu_topology(unsigned int cpuid) { struct cpu_topology *cpuid_topo = &cpu_topology[cpuid]; @@ -296,59 +59,19 @@ topology_populated: update_siblings_masks(cpuid); } -static void clear_cpu_topology(int cpu) -{ - struct cpu_topology *cpu_topo = &cpu_topology[cpu]; - - cpumask_clear(&cpu_topo->llc_sibling); - cpumask_set_cpu(cpu, &cpu_topo->llc_sibling); - - cpumask_clear(&cpu_topo->core_sibling); - cpumask_set_cpu(cpu, &cpu_topo->core_sibling); - cpumask_clear(&cpu_topo->thread_sibling); - cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); -} - -static void __init reset_cpu_topology(void) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) { - struct cpu_topology *cpu_topo = &cpu_topology[cpu]; - - cpu_topo->thread_id = -1; - cpu_topo->core_id = 0; - cpu_topo->package_id = -1; - cpu_topo->llc_id = -1; - - clear_cpu_topology(cpu); - } -} - -void remove_cpu_topology(unsigned int cpu) -{ - int sibling; - - for_each_cpu(sibling, topology_core_cpumask(cpu)) - cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); - for_each_cpu(sibling, topology_sibling_cpumask(cpu)) - cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); - for_each_cpu(sibling, topology_llc_cpumask(cpu)) - cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling)); - - clear_cpu_topology(cpu); -} - #ifdef CONFIG_ACPI /* * Propagate the topology information of the processor_topology_node tree to the * cpu_topology array. */ -static int __init parse_acpi_topology(void) +int __init parse_acpi_topology(void) { bool is_threaded; int cpu, topology_id; + if (acpi_disabled) + return 0; + is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK; for_each_possible_cpu(cpu) { @@ -384,24 +107,6 @@ static int __init parse_acpi_topology(void) return 0; } - -#else -static inline int __init parse_acpi_topology(void) -{ - return -EINVAL; -} #endif -void __init init_cpu_topology(void) -{ - reset_cpu_topology(); - /* - * Discard anything that was parsed if we hit an error so we - * don't use partial information. - */ - if (!acpi_disabled && parse_acpi_topology()) - reset_cpu_topology(); - else if (of_have_populated_dt() && parse_dt_topology()) - reset_cpu_topology(); -} diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 63c1e76739f1..5dc0e1ddd080 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -15,6 +15,11 @@ #include #include #include +#include +#include +#include +#include +#include DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; @@ -241,3 +246,294 @@ static void parsing_done_workfn(struct work_struct *work) #else core_initcall(free_raw_capacity); #endif + +#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) +static int __init get_cpu_for_node(struct device_node *node) +{ + struct device_node *cpu_node; + int cpu; + + cpu_node = of_parse_phandle(node, "cpu", 0); + if (!cpu_node) + return -1; + + cpu = of_cpu_node_to_id(cpu_node); + if (cpu >= 0) + topology_parse_cpu_capacity(cpu_node, cpu); + else + pr_crit("Unable to find CPU node for %pOF\n", cpu_node); + + of_node_put(cpu_node); + return cpu; +} + +static int __init parse_core(struct device_node *core, int package_id, + int core_id) +{ + char name[10]; + bool leaf = true; + int i = 0; + int cpu; + struct device_node *t; + + do { + snprintf(name, sizeof(name), "thread%d", i); + t = of_get_child_by_name(core, name); + if (t) { + leaf = false; + cpu = get_cpu_for_node(t); + if (cpu >= 0) { + cpu_topology[cpu].package_id = package_id; + cpu_topology[cpu].core_id = core_id; + cpu_topology[cpu].thread_id = i; + } else { + pr_err("%pOF: Can't get CPU for thread\n", + t); + of_node_put(t); + return -EINVAL; + } + of_node_put(t); + } + i++; + } while (t); + + cpu = get_cpu_for_node(core); + if (cpu >= 0) { + if (!leaf) { + pr_err("%pOF: Core has both threads and CPU\n", + core); + return -EINVAL; + } + + cpu_topology[cpu].package_id = package_id; + cpu_topology[cpu].core_id = core_id; + } else if (leaf) { + pr_err("%pOF: Can't get CPU for leaf core\n", core); + return -EINVAL; + } + + return 0; +} + +static int __init parse_cluster(struct device_node *cluster, int depth) +{ + char name[10]; + bool leaf = true; + bool has_cores = false; + struct device_node *c; + static int package_id __initdata; + int core_id = 0; + int i, ret; + + /* + * First check for child clusters; we currently ignore any + * information about the nesting of clusters and present the + * scheduler with a flat list of them. + */ + i = 0; + do { + snprintf(name, sizeof(name), "cluster%d", i); + c = of_get_child_by_name(cluster, name); + if (c) { + leaf = false; + ret = parse_cluster(c, depth + 1); + of_node_put(c); + if (ret != 0) + return ret; + } + i++; + } while (c); + + /* Now check for cores */ + i = 0; + do { + snprintf(name, sizeof(name), "core%d", i); + c = of_get_child_by_name(cluster, name); + if (c) { + has_cores = true; + + if (depth == 0) { + pr_err("%pOF: cpu-map children should be clusters\n", + c); + of_node_put(c); + return -EINVAL; + } + + if (leaf) { + ret = parse_core(c, package_id, core_id++); + } else { + pr_err("%pOF: Non-leaf cluster with core %s\n", + cluster, name); + ret = -EINVAL; + } + + of_node_put(c); + if (ret != 0) + return ret; + } + i++; + } while (c); + + if (leaf && !has_cores) + pr_warn("%pOF: empty cluster\n", cluster); + + if (leaf) + package_id++; + + return 0; +} + +static int __init parse_dt_topology(void) +{ + struct device_node *cn, *map; + int ret = 0; + int cpu; + + cn = of_find_node_by_path("/cpus"); + if (!cn) { + pr_err("No CPU information found in DT\n"); + return 0; + } + + /* + * When topology is provided cpu-map is essentially a root + * cluster with restricted subnodes. + */ + map = of_get_child_by_name(cn, "cpu-map"); + if (!map) + goto out; + + ret = parse_cluster(map, 0); + if (ret != 0) + goto out_map; + + topology_normalize_cpu_scale(); + + /* + * Check that all cores are in the topology; the SMP code will + * only mark cores described in the DT as possible. + */ + for_each_possible_cpu(cpu) + if (cpu_topology[cpu].package_id == -1) + ret = -EINVAL; + +out_map: + of_node_put(map); +out: + of_node_put(cn); + return ret; +} + +/* + * cpu topology table + */ +struct cpu_topology cpu_topology[NR_CPUS]; +EXPORT_SYMBOL_GPL(cpu_topology); + +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu)); + + /* Find the smaller of NUMA, core or LLC siblings */ + if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) { + /* not numa in package, lets use the package siblings */ + core_mask = &cpu_topology[cpu].core_sibling; + } + if (cpu_topology[cpu].llc_id != -1) { + if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask)) + core_mask = &cpu_topology[cpu].llc_sibling; + } + + return core_mask; +} + +void update_siblings_masks(unsigned int cpuid) +{ + struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; + int cpu; + + /* update core and thread sibling masks */ + for_each_online_cpu(cpu) { + cpu_topo = &cpu_topology[cpu]; + + if (cpuid_topo->llc_id == cpu_topo->llc_id) { + cpumask_set_cpu(cpu, &cpuid_topo->llc_sibling); + cpumask_set_cpu(cpuid, &cpu_topo->llc_sibling); + } + + if (cpuid_topo->package_id != cpu_topo->package_id) + continue; + + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); + cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); + + if (cpuid_topo->core_id != cpu_topo->core_id) + continue; + + cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); + cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); + } +} + +static void clear_cpu_topology(int cpu) +{ + struct cpu_topology *cpu_topo = &cpu_topology[cpu]; + + cpumask_clear(&cpu_topo->llc_sibling); + cpumask_set_cpu(cpu, &cpu_topo->llc_sibling); + + cpumask_clear(&cpu_topo->core_sibling); + cpumask_set_cpu(cpu, &cpu_topo->core_sibling); + cpumask_clear(&cpu_topo->thread_sibling); + cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); +} + +static void __init reset_cpu_topology(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + struct cpu_topology *cpu_topo = &cpu_topology[cpu]; + + cpu_topo->thread_id = -1; + cpu_topo->core_id = -1; + cpu_topo->package_id = -1; + cpu_topo->llc_id = -1; + + clear_cpu_topology(cpu); + } +} + +void remove_cpu_topology(unsigned int cpu) +{ + int sibling; + + for_each_cpu(sibling, topology_core_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); + for_each_cpu(sibling, topology_llc_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling)); + + clear_cpu_topology(cpu); +} + +__weak int __init parse_acpi_topology(void) +{ + return 0; +} + +void __init init_cpu_topology(void) +{ + reset_cpu_topology(); + + /* + * Discard anything that was parsed if we hit an error so we + * don't use partial information. + */ + if (parse_acpi_topology()) + reset_cpu_topology(); + else if (of_have_populated_dt() && parse_dt_topology()) + reset_cpu_topology(); +} +#endif diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 1cfe05ea1d89..ede0ce4623b4 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -33,4 +33,32 @@ unsigned long topology_get_freq_scale(int cpu) return per_cpu(freq_scale, cpu); } +struct cpu_topology { + int thread_id; + int core_id; + int package_id; + int llc_id; + cpumask_t thread_sibling; + cpumask_t core_sibling; + cpumask_t llc_sibling; +}; + +#ifdef CONFIG_GENERIC_ARCH_TOPOLOGY +extern struct cpu_topology cpu_topology[NR_CPUS]; + +#define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) +#define topology_core_id(cpu) (cpu_topology[cpu].core_id) +#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) +#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) +#define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) +void init_cpu_topology(void); +void store_cpu_topology(unsigned int cpuid); +const struct cpumask *cpu_coregroup_mask(int cpu); +#endif + +#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) +void update_siblings_masks(unsigned int cpu); +#endif +void remove_cpu_topology(unsigned int cpuid); + #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 47a3e3c08036..2a19d196af28 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -27,6 +27,7 @@ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H +#include #include #include #include -- cgit v1.2.3 From ca74b316df96d7c40ee3e8301065607c11c60c27 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 27 Jun 2019 12:52:59 -0700 Subject: arm: Use common cpu_topology structure and functions. Currently, ARM32 and ARM64 uses different data structures to represent their cpu topologies. Since, we are moving the ARM64 topology to common code to be used by other architectures, we can reuse that for ARM32 as well. Take this opprtunity to remove the redundant functions from ARM32 and reuse the common code instead. To: Russell King Signed-off-by: Atish Patra Tested-by: Sudeep Holla (on TC2) Reviewed-by: Sudeep Holla Signed-off-by: Paul Walmsley --- arch/arm/include/asm/topology.h | 20 -------------- arch/arm/kernel/topology.c | 60 +++++------------------------------------ drivers/base/arch_topology.c | 4 ++- include/linux/arch_topology.h | 6 ++--- 4 files changed, 11 insertions(+), 79 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 2a786f54d8b8..8a0fae94d45e 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -5,26 +5,6 @@ #ifdef CONFIG_ARM_CPU_TOPOLOGY #include - -struct cputopo_arm { - int thread_id; - int core_id; - int socket_id; - cpumask_t thread_sibling; - cpumask_t core_sibling; -}; - -extern struct cputopo_arm cpu_topology[NR_CPUS]; - -#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) -#define topology_core_id(cpu) (cpu_topology[cpu].core_id) -#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) -#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) - -void init_cpu_topology(void); -void store_cpu_topology(unsigned int cpuid); -const struct cpumask *cpu_coregroup_mask(int cpu); - #include /* Replace task scheduler's default frequency-invariant accounting */ diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index d17cb1e6d679..5b9faba03afb 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -177,17 +177,6 @@ static inline void parse_dt_topology(void) {} static inline void update_cpu_capacity(unsigned int cpuid) {} #endif - /* - * cpu topology table - */ -struct cputopo_arm cpu_topology[NR_CPUS]; -EXPORT_SYMBOL_GPL(cpu_topology); - -const struct cpumask *cpu_coregroup_mask(int cpu) -{ - return &cpu_topology[cpu].core_sibling; -} - /* * The current assumption is that we can power gate each core independently. * This will be superseded by DT binding once available. @@ -197,32 +186,6 @@ const struct cpumask *cpu_corepower_mask(int cpu) return &cpu_topology[cpu].thread_sibling; } -static void update_siblings_masks(unsigned int cpuid) -{ - struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; - int cpu; - - /* update core and thread sibling masks */ - for_each_possible_cpu(cpu) { - cpu_topo = &cpu_topology[cpu]; - - if (cpuid_topo->socket_id != cpu_topo->socket_id) - continue; - - cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); - if (cpu != cpuid) - cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); - - if (cpuid_topo->core_id != cpu_topo->core_id) - continue; - - cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); - if (cpu != cpuid) - cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); - } - smp_wmb(); -} - /* * store_cpu_topology is called at boot when only one cpu is running * and with the mutex cpu_hotplug.lock locked, when several cpus have booted, @@ -230,7 +193,7 @@ static void update_siblings_masks(unsigned int cpuid) */ void store_cpu_topology(unsigned int cpuid) { - struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid]; + struct cpu_topology *cpuid_topo = &cpu_topology[cpuid]; unsigned int mpidr; /* If the cpu topology has been already set, just return */ @@ -250,12 +213,12 @@ void store_cpu_topology(unsigned int cpuid) /* core performance interdependency */ cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); - cpuid_topo->socket_id = MPIDR_AFFINITY_LEVEL(mpidr, 2); + cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2); } else { /* largely independent cores */ cpuid_topo->thread_id = -1; cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->socket_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); + cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); } } else { /* @@ -265,7 +228,7 @@ void store_cpu_topology(unsigned int cpuid) */ cpuid_topo->thread_id = -1; cpuid_topo->core_id = 0; - cpuid_topo->socket_id = -1; + cpuid_topo->package_id = -1; } update_siblings_masks(cpuid); @@ -275,7 +238,7 @@ void store_cpu_topology(unsigned int cpuid) pr_info("CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n", cpuid, cpu_topology[cpuid].thread_id, cpu_topology[cpuid].core_id, - cpu_topology[cpuid].socket_id, mpidr); + cpu_topology[cpuid].package_id, mpidr); } static inline int cpu_corepower_flags(void) @@ -298,18 +261,7 @@ static struct sched_domain_topology_level arm_topology[] = { */ void __init init_cpu_topology(void) { - unsigned int cpu; - - /* init core mask and capacity */ - for_each_possible_cpu(cpu) { - struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]); - - cpu_topo->thread_id = -1; - cpu_topo->core_id = -1; - cpu_topo->socket_id = -1; - cpumask_clear(&cpu_topo->core_sibling); - cpumask_clear(&cpu_topo->thread_sibling); - } + reset_cpu_topology(); smp_wmb(); parse_dt_topology(); diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 5dc0e1ddd080..b54d241a2ff5 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -423,6 +423,7 @@ out: of_node_put(cn); return ret; } +#endif /* * cpu topology table @@ -488,7 +489,7 @@ static void clear_cpu_topology(int cpu) cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); } -static void __init reset_cpu_topology(void) +void __init reset_cpu_topology(void) { unsigned int cpu; @@ -523,6 +524,7 @@ __weak int __init parse_acpi_topology(void) return 0; } +#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) void __init init_cpu_topology(void) { reset_cpu_topology(); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index ede0ce4623b4..42f2b5126094 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -54,11 +54,9 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); -#endif - -#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) void update_siblings_masks(unsigned int cpu); -#endif void remove_cpu_topology(unsigned int cpuid); +void reset_cpu_topology(void); +#endif #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ -- cgit v1.2.3 From 2b74c878b0eae4c32629c2d5ba69a29f69048313 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 15 Jul 2019 12:45:28 -0400 Subject: IB/hfi1: Unreserve a flushed OPFN request When an OPFN request is flushed, the request is completed without unreserving itself from the send queue. Subsequently, when a new request is post sent, the following warning will be triggered: WARNING: CPU: 4 PID: 8130 at rdmavt/qp.c:1761 rvt_post_send+0x72a/0x880 [rdmavt] Call Trace: [] dump_stack+0x19/0x1b [] __warn+0xd8/0x100 [] warn_slowpath_null+0x1d/0x20 [] rvt_post_send+0x72a/0x880 [rdmavt] [] ? account_entity_dequeue+0xae/0xd0 [] ? __kmalloc+0x55/0x230 [] ib_uverbs_post_send+0x37c/0x5d0 [ib_uverbs] [] ? rdma_lookup_put_uobject+0x26/0x60 [ib_uverbs] [] ib_uverbs_write+0x286/0x460 [ib_uverbs] [] ? security_file_permission+0x27/0xa0 [] vfs_write+0xc0/0x1f0 [] SyS_write+0x7f/0xf0 [] system_call_fastpath+0x22/0x27 This patch fixes the problem by moving rvt_qp_wqe_unreserve() into rvt_qp_complete_swqe() to simplify the code and make it less error-prone. Fixes: ca95f802ef51 ("IB/hfi1: Unreserve a reserved request when it is completed") Link: https://lore.kernel.org/r/20190715164528.74174.31364.stgit@awfm-01.aw.intel.com Cc: Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/rc.c | 2 -- include/rdma/rdmavt_qp.h | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 0477c14633ab..024a7c2b6124 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1835,7 +1835,6 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; trdma_clean_swqe(qp, wqe); - rvt_qp_wqe_unreserve(qp, wqe); trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); rvt_qp_complete_swqe(qp, wqe, @@ -1882,7 +1881,6 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { trdma_clean_swqe(qp, wqe); - rvt_qp_wqe_unreserve(qp, wqe); trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); rvt_qp_complete_swqe(qp, wqe, diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 0eeea520a853..e06c77d76463 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -608,7 +608,7 @@ static inline void rvt_qp_wqe_reserve( /** * rvt_qp_wqe_unreserve - clean reserved operation * @qp - the rvt qp - * @wqe - the send wqe + * @flags - send wqe flags * * This decrements the reserve use count. * @@ -620,11 +620,9 @@ static inline void rvt_qp_wqe_reserve( * the compiler does not juggle the order of the s_last * ring index and the decrementing of s_reserved_used. */ -static inline void rvt_qp_wqe_unreserve( - struct rvt_qp *qp, - struct rvt_swqe *wqe) +static inline void rvt_qp_wqe_unreserve(struct rvt_qp *qp, int flags) { - if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) { + if (unlikely(flags & RVT_SEND_RESERVE_USED)) { atomic_dec(&qp->s_reserved_used); /* insure no compiler re-order up to s_last change */ smp_mb__after_atomic(); @@ -853,6 +851,7 @@ rvt_qp_complete_swqe(struct rvt_qp *qp, u32 byte_len, last; int flags = wqe->wr.send_flags; + rvt_qp_wqe_unreserve(qp, flags); rvt_put_qp_swqe(qp, wqe); need_completion = -- cgit v1.2.3 From 2169e6daa1ffa6e9869fcc56ff7df23c9287f1ec Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jun 2019 09:49:53 -0400 Subject: media: media/pci: don't set description for ENUM_FMT The V4L2 core sets the description for the driver in order to ensure consistent naming. So drop the strscpy of the description in drivers. Also remove any description strings in driver-internal structures since those are no longer needed. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/common/saa7146/saa7146_video.c | 16 ++------------- drivers/media/pci/bt8xx/bttv-driver.c | 19 ----------------- drivers/media/pci/bt8xx/bttv-risc.c | 8 ++++---- drivers/media/pci/bt8xx/bttvp.h | 1 - drivers/media/pci/cobalt/cobalt-v4l2.c | 5 ----- drivers/media/pci/cx23885/cx23885-417.c | 1 - drivers/media/pci/cx23885/cx23885-video.c | 7 ++----- drivers/media/pci/cx23885/cx23885.h | 1 - drivers/media/pci/cx25821/cx25821-video.c | 7 ++----- drivers/media/pci/cx25821/cx25821.h | 1 - drivers/media/pci/cx88/cx88-blackbird.c | 2 -- drivers/media/pci/cx88/cx88-video.c | 17 +++------------- drivers/media/pci/cx88/cx88.h | 1 - drivers/media/pci/dt3155/dt3155.c | 1 - drivers/media/pci/meye/meye.c | 3 --- drivers/media/pci/saa7134/saa7134-empress.c | 2 -- drivers/media/pci/saa7134/saa7134-video.c | 28 ++++---------------------- drivers/media/pci/saa7134/saa7134.h | 1 - drivers/media/pci/saa7164/saa7164-encoder.c | 1 - drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c | 7 ------- drivers/media/pci/solo6x10/solo6x10-v4l2.c | 2 -- drivers/media/pci/sta2x11/sta2x11_vip.c | 2 -- drivers/media/pci/tw68/tw68-video.c | 13 ------------ drivers/media/pci/tw68/tw68.h | 1 - include/media/drv-intf/saa7146_vv.h | 1 - 25 files changed, 17 insertions(+), 131 deletions(-) (limited to 'include') diff --git a/drivers/media/common/saa7146/saa7146_video.c b/drivers/media/common/saa7146/saa7146_video.c index 4c399a42e874..d16122039b0c 100644 --- a/drivers/media/common/saa7146/saa7146_video.c +++ b/drivers/media/common/saa7146/saa7146_video.c @@ -20,62 +20,52 @@ MODULE_PARM_DESC(max_memory, "maximum memory usage for capture buffers (default: /* format descriptions for capture and preview */ static struct saa7146_format formats[] = { { - .name = "RGB-8 (3-3-2)", .pixelformat = V4L2_PIX_FMT_RGB332, .trans = RGB08_COMPOSED, .depth = 8, .flags = 0, }, { - .name = "RGB-16 (5/B-6/G-5/R)", .pixelformat = V4L2_PIX_FMT_RGB565, .trans = RGB16_COMPOSED, .depth = 16, .flags = 0, }, { - .name = "RGB-24 (B-G-R)", .pixelformat = V4L2_PIX_FMT_BGR24, .trans = RGB24_COMPOSED, .depth = 24, .flags = 0, }, { - .name = "RGB-32 (B-G-R)", .pixelformat = V4L2_PIX_FMT_BGR32, .trans = RGB32_COMPOSED, .depth = 32, .flags = 0, }, { - .name = "RGB-32 (R-G-B)", .pixelformat = V4L2_PIX_FMT_RGB32, .trans = RGB32_COMPOSED, .depth = 32, .flags = 0, .swap = 0x2, }, { - .name = "Greyscale-8", .pixelformat = V4L2_PIX_FMT_GREY, .trans = Y8, .depth = 8, .flags = 0, }, { - .name = "YUV 4:2:2 planar (Y-Cb-Cr)", .pixelformat = V4L2_PIX_FMT_YUV422P, .trans = YUV422_DECOMPOSED, .depth = 16, .flags = FORMAT_BYTE_SWAP|FORMAT_IS_PLANAR, }, { - .name = "YVU 4:2:0 planar (Y-Cb-Cr)", .pixelformat = V4L2_PIX_FMT_YVU420, .trans = YUV420_DECOMPOSED, .depth = 12, .flags = FORMAT_BYTE_SWAP|FORMAT_IS_PLANAR, }, { - .name = "YUV 4:2:0 planar (Y-Cb-Cr)", .pixelformat = V4L2_PIX_FMT_YUV420, .trans = YUV420_DECOMPOSED, .depth = 12, .flags = FORMAT_IS_PLANAR, }, { - .name = "YUV 4:2:2 (U-Y-V-Y)", .pixelformat = V4L2_PIX_FMT_UYVY, .trans = YUV422_COMPOSED, .depth = 16, @@ -147,10 +137,10 @@ int saa7146_start_preview(struct saa7146_fh *fh) } vv->ov.win = fmt.fmt.win; - DEB_D("%dx%d+%d+%d %s field=%s\n", + DEB_D("%dx%d+%d+%d 0x%08x field=%s\n", vv->ov.win.w.width, vv->ov.win.w.height, vv->ov.win.w.left, vv->ov.win.w.top, - vv->ov_fmt->name, v4l2_field_names[vv->ov.win.field]); + vv->ov_fmt->pixelformat, v4l2_field_names[vv->ov.win.field]); if (0 != (ret = saa7146_enable_overlay(fh))) { DEB_D("enabling overlay failed: %d\n", ret); @@ -515,8 +505,6 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *fh, struct v4l2_fmtd { if (f->index >= ARRAY_SIZE(formats)) return -EINVAL; - strscpy((char *)f->description, formats[f->index].name, - sizeof(f->description)); f->pixelformat = formats[f->index].pixelformat; return 0; } diff --git a/drivers/media/pci/bt8xx/bttv-driver.c b/drivers/media/pci/bt8xx/bttv-driver.c index 612d1c0010c1..a359da7773a9 100644 --- a/drivers/media/pci/bt8xx/bttv-driver.c +++ b/drivers/media/pci/bt8xx/bttv-driver.c @@ -503,77 +503,65 @@ static const unsigned int BTTV_TVNORMS = ARRAY_SIZE(bttv_tvnorms); packed pixel formats must come first */ static const struct bttv_format formats[] = { { - .name = "8 bpp, gray", .fourcc = V4L2_PIX_FMT_GREY, .btformat = BT848_COLOR_FMT_Y8, .depth = 8, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "8 bpp, dithered color", .fourcc = V4L2_PIX_FMT_HI240, .btformat = BT848_COLOR_FMT_RGB8, .depth = 8, .flags = FORMAT_FLAGS_PACKED | FORMAT_FLAGS_DITHER, },{ - .name = "15 bpp RGB, le", .fourcc = V4L2_PIX_FMT_RGB555, .btformat = BT848_COLOR_FMT_RGB15, .depth = 16, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "15 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB555X, .btformat = BT848_COLOR_FMT_RGB15, .btswap = 0x03, /* byteswap */ .depth = 16, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "16 bpp RGB, le", .fourcc = V4L2_PIX_FMT_RGB565, .btformat = BT848_COLOR_FMT_RGB16, .depth = 16, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "16 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB565X, .btformat = BT848_COLOR_FMT_RGB16, .btswap = 0x03, /* byteswap */ .depth = 16, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "24 bpp RGB, le", .fourcc = V4L2_PIX_FMT_BGR24, .btformat = BT848_COLOR_FMT_RGB24, .depth = 24, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "32 bpp RGB, le", .fourcc = V4L2_PIX_FMT_BGR32, .btformat = BT848_COLOR_FMT_RGB32, .depth = 32, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "32 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB32, .btformat = BT848_COLOR_FMT_RGB32, .btswap = 0x0f, /* byte+word swap */ .depth = 32, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "4:2:2, packed, YUYV", .fourcc = V4L2_PIX_FMT_YUYV, .btformat = BT848_COLOR_FMT_YUY2, .depth = 16, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "4:2:2, packed, UYVY", .fourcc = V4L2_PIX_FMT_UYVY, .btformat = BT848_COLOR_FMT_YUY2, .btswap = 0x03, /* byteswap */ .depth = 16, .flags = FORMAT_FLAGS_PACKED, },{ - .name = "4:2:2, planar, Y-Cb-Cr", .fourcc = V4L2_PIX_FMT_YUV422P, .btformat = BT848_COLOR_FMT_YCrCb422, .depth = 16, @@ -581,7 +569,6 @@ static const struct bttv_format formats[] = { .hshift = 1, .vshift = 0, },{ - .name = "4:2:0, planar, Y-Cb-Cr", .fourcc = V4L2_PIX_FMT_YUV420, .btformat = BT848_COLOR_FMT_YCrCb422, .depth = 12, @@ -589,7 +576,6 @@ static const struct bttv_format formats[] = { .hshift = 1, .vshift = 1, },{ - .name = "4:2:0, planar, Y-Cr-Cb", .fourcc = V4L2_PIX_FMT_YVU420, .btformat = BT848_COLOR_FMT_YCrCb422, .depth = 12, @@ -597,7 +583,6 @@ static const struct bttv_format formats[] = { .hshift = 1, .vshift = 1, },{ - .name = "4:1:1, planar, Y-Cb-Cr", .fourcc = V4L2_PIX_FMT_YUV411P, .btformat = BT848_COLOR_FMT_YCrCb411, .depth = 12, @@ -605,7 +590,6 @@ static const struct bttv_format formats[] = { .hshift = 2, .vshift = 0, },{ - .name = "4:1:0, planar, Y-Cb-Cr", .fourcc = V4L2_PIX_FMT_YUV410, .btformat = BT848_COLOR_FMT_YCrCb411, .depth = 9, @@ -613,7 +597,6 @@ static const struct bttv_format formats[] = { .hshift = 2, .vshift = 2, },{ - .name = "4:1:0, planar, Y-Cr-Cb", .fourcc = V4L2_PIX_FMT_YVU410, .btformat = BT848_COLOR_FMT_YCrCb411, .depth = 9, @@ -621,7 +604,6 @@ static const struct bttv_format formats[] = { .hshift = 2, .vshift = 2, },{ - .name = "raw scanlines", .fourcc = -1, .btformat = BT848_COLOR_FMT_RAW, .depth = 8, @@ -2500,7 +2482,6 @@ static int bttv_enum_fmt_cap_ovr(struct v4l2_fmtdesc *f) return -EINVAL; f->pixelformat = formats[i].fourcc; - strscpy(f->description, formats[i].name, sizeof(f->description)); return i; } diff --git a/drivers/media/pci/bt8xx/bttv-risc.c b/drivers/media/pci/bt8xx/bttv-risc.c index 6b59ca337c7f..fc8708047be8 100644 --- a/drivers/media/pci/bt8xx/bttv-risc.c +++ b/drivers/media/pci/bt8xx/bttv-risc.c @@ -699,9 +699,9 @@ bttv_buffer_risc(struct bttv *btv, struct bttv_buffer *buf) const struct bttv_tvnorm *tvnorm = bttv_tvnorms + buf->tvnorm; struct videobuf_dmabuf *dma=videobuf_to_dma(&buf->vb); - dprintk("%d: buffer field: %s format: %s size: %dx%d\n", + dprintk("%d: buffer field: %s format: 0x%08x size: %dx%d\n", btv->c.nr, v4l2_field_names[buf->vb.field], - buf->fmt->name, buf->vb.width, buf->vb.height); + buf->fmt->fourcc, buf->vb.width, buf->vb.height); /* packed pixel modes */ if (buf->fmt->flags & FORMAT_FLAGS_PACKED) { @@ -860,9 +860,9 @@ bttv_overlay_risc(struct bttv *btv, struct bttv_buffer *buf) { /* check interleave, bottom+top fields */ - dprintk("%d: overlay fields: %s format: %s size: %dx%d\n", + dprintk("%d: overlay fields: %s format: 0x%08x size: %dx%d\n", btv->c.nr, v4l2_field_names[buf->vb.field], - fmt->name, ov->w.width, ov->w.height); + fmt->fourcc, ov->w.width, ov->w.height); /* calculate geometry */ bttv_calc_geo(btv,&buf->geo,ov->w.width,ov->w.height, diff --git a/drivers/media/pci/bt8xx/bttvp.h b/drivers/media/pci/bt8xx/bttvp.h index b159d6ddbfcf..4abf43657846 100644 --- a/drivers/media/pci/bt8xx/bttvp.h +++ b/drivers/media/pci/bt8xx/bttvp.h @@ -99,7 +99,6 @@ struct bttv_tvnorm { extern const struct bttv_tvnorm bttv_tvnorms[]; struct bttv_format { - char *name; int fourcc; /* video4linux 2 */ int btformat; /* BT848_COLOR_FMT_* */ int btswap; /* BT848_COLOR_CTL_* */ diff --git a/drivers/media/pci/cobalt/cobalt-v4l2.c b/drivers/media/pci/cobalt/cobalt-v4l2.c index 39dabd4da60f..ea96f333ee2b 100644 --- a/drivers/media/pci/cobalt/cobalt-v4l2.c +++ b/drivers/media/pci/cobalt/cobalt-v4l2.c @@ -688,15 +688,12 @@ static int cobalt_enum_fmt_vid_cap(struct file *file, void *priv_fh, { switch (f->index) { case 0: - strscpy(f->description, "YUV 4:2:2", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_YUYV; break; case 1: - strscpy(f->description, "RGB24", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_RGB24; break; case 2: - strscpy(f->description, "RGB32", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_BGR32; break; default: @@ -893,11 +890,9 @@ static int cobalt_enum_fmt_vid_out(struct file *file, void *priv_fh, { switch (f->index) { case 0: - strscpy(f->description, "YUV 4:2:2", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_YUYV; break; case 1: - strscpy(f->description, "RGB32", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_BGR32; break; default: diff --git a/drivers/media/pci/cx23885/cx23885-417.c b/drivers/media/pci/cx23885/cx23885-417.c index 82f96a4091ac..2327fe612610 100644 --- a/drivers/media/pci/cx23885/cx23885-417.c +++ b/drivers/media/pci/cx23885/cx23885-417.c @@ -1339,7 +1339,6 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, if (f->index != 0) return -EINVAL; - strscpy(f->description, "MPEG", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_MPEG; return 0; diff --git a/drivers/media/pci/cx23885/cx23885-video.c b/drivers/media/pci/cx23885/cx23885-video.c index b254473db9a3..8098b15493de 100644 --- a/drivers/media/pci/cx23885/cx23885-video.c +++ b/drivers/media/pci/cx23885/cx23885-video.c @@ -67,7 +67,6 @@ MODULE_PARM_DESC(vid_limit, "capture memory limit in megabytes"); #define FORMAT_FLAGS_PACKED 0x01 static struct cx23885_fmt formats[] = { { - .name = "4:2:2, packed, YUYV", .fourcc = V4L2_PIX_FMT_YUYV, .depth = 16, .flags = FORMAT_FLAGS_PACKED, @@ -411,9 +410,9 @@ static int buffer_prepare(struct vb2_buffer *vb) default: BUG(); } - dprintk(2, "[%p/%d] buffer_init - %dx%d %dbpp \"%s\" - dma=0x%08lx\n", + dprintk(2, "[%p/%d] buffer_init - %dx%d %dbpp 0x%08x - dma=0x%08lx\n", buf, buf->vb.vb2_buf.index, - dev->width, dev->height, dev->fmt->depth, dev->fmt->name, + dev->width, dev->height, dev->fmt->depth, dev->fmt->fourcc, (unsigned long)buf->risc.dma); return 0; } @@ -647,8 +646,6 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, if (unlikely(f->index >= ARRAY_SIZE(formats))) return -EINVAL; - strscpy(f->description, formats[f->index].name, - sizeof(f->description)); f->pixelformat = formats[f->index].fourcc; return 0; diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h index 9da66fdd5a0d..a95a2e4c6a0d 100644 --- a/drivers/media/pci/cx23885/cx23885.h +++ b/drivers/media/pci/cx23885/cx23885.h @@ -127,7 +127,6 @@ V4L2_STD_PAL_60 | V4L2_STD_SECAM_L | V4L2_STD_SECAM_DK) struct cx23885_fmt { - char *name; u32 fourcc; /* v4l2 format id */ int depth; int flags; diff --git a/drivers/media/pci/cx25821/cx25821-video.c b/drivers/media/pci/cx25821/cx25821-video.c index de7641170478..a10261da0db6 100644 --- a/drivers/media/pci/cx25821/cx25821-video.c +++ b/drivers/media/pci/cx25821/cx25821-video.c @@ -35,12 +35,10 @@ MODULE_PARM_DESC(irq_debug, "enable debug messages [IRQ handler]"); static const struct cx25821_fmt formats[] = { { - .name = "4:1:1, packed, Y41P", .fourcc = V4L2_PIX_FMT_Y41P, .depth = 12, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "4:2:2, packed, YUYV", .fourcc = V4L2_PIX_FMT_YUYV, .depth = 16, .flags = FORMAT_FLAGS_PACKED, @@ -215,9 +213,9 @@ static int cx25821_buffer_prepare(struct vb2_buffer *vb) break; } - dprintk(2, "[%p/%d] buffer_prep - %dx%d %dbpp \"%s\" - dma=0x%08lx\n", + dprintk(2, "[%p/%d] buffer_prep - %dx%d %dbpp 0x%08x - dma=0x%08lx\n", buf, buf->vb.vb2_buf.index, chan->width, chan->height, - chan->fmt->depth, chan->fmt->name, + chan->fmt->depth, chan->fmt->fourcc, (unsigned long)buf->risc.dma); return ret; @@ -311,7 +309,6 @@ static int cx25821_vidioc_enum_fmt_vid_cap(struct file *file, void *priv, if (unlikely(f->index >= ARRAY_SIZE(formats))) return -EINVAL; - strscpy(f->description, formats[f->index].name, sizeof(f->description)); f->pixelformat = formats[f->index].fourcc; return 0; diff --git a/drivers/media/pci/cx25821/cx25821.h b/drivers/media/pci/cx25821/cx25821.h index 47dbaae78509..017307984094 100644 --- a/drivers/media/pci/cx25821/cx25821.h +++ b/drivers/media/pci/cx25821/cx25821.h @@ -83,7 +83,6 @@ #define VID_CHANNEL_NUM 8 struct cx25821_fmt { - char *name; u32 fourcc; /* v4l2 format id */ int depth; int flags; diff --git a/drivers/media/pci/cx88/cx88-blackbird.c b/drivers/media/pci/cx88/cx88-blackbird.c index 200d68827073..d3da7f4297af 100644 --- a/drivers/media/pci/cx88/cx88-blackbird.c +++ b/drivers/media/pci/cx88/cx88-blackbird.c @@ -805,9 +805,7 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, if (f->index != 0) return -EINVAL; - strscpy(f->description, "MPEG", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_MPEG; - f->flags = V4L2_FMT_FLAG_COMPRESSED; return 0; } diff --git a/drivers/media/pci/cx88/cx88-video.c b/drivers/media/pci/cx88/cx88-video.c index e59a74514c7c..dcc0f02aeb70 100644 --- a/drivers/media/pci/cx88/cx88-video.c +++ b/drivers/media/pci/cx88/cx88-video.c @@ -69,62 +69,52 @@ MODULE_PARM_DESC(irq_debug, "enable debug messages [IRQ handler]"); static const struct cx8800_fmt formats[] = { { - .name = "8 bpp, gray", .fourcc = V4L2_PIX_FMT_GREY, .cxformat = ColorFormatY8, .depth = 8, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "15 bpp RGB, le", .fourcc = V4L2_PIX_FMT_RGB555, .cxformat = ColorFormatRGB15, .depth = 16, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "15 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB555X, .cxformat = ColorFormatRGB15 | ColorFormatBSWAP, .depth = 16, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "16 bpp RGB, le", .fourcc = V4L2_PIX_FMT_RGB565, .cxformat = ColorFormatRGB16, .depth = 16, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "16 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB565X, .cxformat = ColorFormatRGB16 | ColorFormatBSWAP, .depth = 16, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "24 bpp RGB, le", .fourcc = V4L2_PIX_FMT_BGR24, .cxformat = ColorFormatRGB24, .depth = 24, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "32 bpp RGB, le", .fourcc = V4L2_PIX_FMT_BGR32, .cxformat = ColorFormatRGB32, .depth = 32, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "32 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB32, .cxformat = ColorFormatRGB32 | ColorFormatBSWAP | ColorFormatWSWAP, .depth = 32, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "4:2:2, packed, YUYV", .fourcc = V4L2_PIX_FMT_YUYV, .cxformat = ColorFormatYUY2, .depth = 16, .flags = FORMAT_FLAGS_PACKED, }, { - .name = "4:2:2, packed, UYVY", .fourcc = V4L2_PIX_FMT_UYVY, .cxformat = ColorFormatYUY2 | ColorFormatBSWAP, .depth = 16, @@ -489,9 +479,9 @@ static int buffer_prepare(struct vb2_buffer *vb) break; } dprintk(2, - "[%p/%d] buffer_prepare - %dx%d %dbpp \"%s\" - dma=0x%08lx\n", - buf, buf->vb.vb2_buf.index, - core->width, core->height, dev->fmt->depth, dev->fmt->name, + "[%p/%d] %s - %dx%d %dbpp 0x%08x - dma=0x%08lx\n", + buf, buf->vb.vb2_buf.index, __func__, + core->width, core->height, dev->fmt->depth, dev->fmt->fourcc, (unsigned long)buf->risc.dma); return 0; } @@ -829,7 +819,6 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, if (unlikely(f->index >= ARRAY_SIZE(formats))) return -EINVAL; - strscpy(f->description, formats[f->index].name, sizeof(f->description)); f->pixelformat = formats[f->index].fourcc; return 0; diff --git a/drivers/media/pci/cx88/cx88.h b/drivers/media/pci/cx88/cx88.h index a70a50dc3edf..744a22328ebc 100644 --- a/drivers/media/pci/cx88/cx88.h +++ b/drivers/media/pci/cx88/cx88.h @@ -99,7 +99,6 @@ static inline unsigned int norm_maxh(v4l2_std_id norm) /* static data */ struct cx8800_fmt { - const char *name; u32 fourcc; /* v4l2 format id */ int depth; int flags; diff --git a/drivers/media/pci/dt3155/dt3155.c b/drivers/media/pci/dt3155/dt3155.c index b4cdda50e742..7480f0d3ad0f 100644 --- a/drivers/media/pci/dt3155/dt3155.c +++ b/drivers/media/pci/dt3155/dt3155.c @@ -306,7 +306,6 @@ static int dt3155_enum_fmt_vid_cap(struct file *filp, if (f->index) return -EINVAL; f->pixelformat = V4L2_PIX_FMT_GREY; - strscpy(f->description, "8-bit Greyscale", sizeof(f->description)); return 0; } diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c index 8218810c899e..0e61c81356ef 100644 --- a/drivers/media/pci/meye/meye.c +++ b/drivers/media/pci/meye/meye.c @@ -1104,12 +1104,9 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *fh, if (f->index == 0) { /* standard YUV 422 capture */ f->flags = 0; - strscpy(f->description, "YUV422", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_YUYV; } else { /* compressed MJPEG capture */ - f->flags = V4L2_FMT_FLAG_COMPRESSED; - strscpy(f->description, "MJPEG", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_MJPEG; } diff --git a/drivers/media/pci/saa7134/saa7134-empress.c b/drivers/media/pci/saa7134/saa7134-empress.c index 1a41a56afec6..cb65d345fd3e 100644 --- a/drivers/media/pci/saa7134/saa7134-empress.c +++ b/drivers/media/pci/saa7134/saa7134-empress.c @@ -91,9 +91,7 @@ static int empress_enum_fmt_vid_cap(struct file *file, void *priv, if (f->index != 0) return -EINVAL; - strscpy(f->description, "MPEG TS", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_MPEG; - f->flags = V4L2_FMT_FLAG_COMPRESSED; return 0; } diff --git a/drivers/media/pci/saa7134/saa7134-video.c b/drivers/media/pci/saa7134/saa7134-video.c index 606df51bb636..342cabf48064 100644 --- a/drivers/media/pci/saa7134/saa7134-video.c +++ b/drivers/media/pci/saa7134/saa7134-video.c @@ -90,70 +90,58 @@ static int video_out[][9] = { static struct saa7134_format formats[] = { { - .name = "8 bpp gray", .fourcc = V4L2_PIX_FMT_GREY, .depth = 8, .pm = 0x06, },{ - .name = "15 bpp RGB, le", .fourcc = V4L2_PIX_FMT_RGB555, .depth = 16, .pm = 0x13 | 0x80, },{ - .name = "15 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB555X, .depth = 16, .pm = 0x13 | 0x80, .bswap = 1, },{ - .name = "16 bpp RGB, le", .fourcc = V4L2_PIX_FMT_RGB565, .depth = 16, .pm = 0x10 | 0x80, },{ - .name = "16 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB565X, .depth = 16, .pm = 0x10 | 0x80, .bswap = 1, },{ - .name = "24 bpp RGB, le", .fourcc = V4L2_PIX_FMT_BGR24, .depth = 24, .pm = 0x11, },{ - .name = "24 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB24, .depth = 24, .pm = 0x11, .bswap = 1, },{ - .name = "32 bpp RGB, le", .fourcc = V4L2_PIX_FMT_BGR32, .depth = 32, .pm = 0x12, },{ - .name = "32 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB32, .depth = 32, .pm = 0x12, .bswap = 1, .wswap = 1, },{ - .name = "4:2:2 packed, YUYV", .fourcc = V4L2_PIX_FMT_YUYV, .depth = 16, .pm = 0x00, .bswap = 1, .yuv = 1, },{ - .name = "4:2:2 packed, UYVY", .fourcc = V4L2_PIX_FMT_UYVY, .depth = 16, .pm = 0x00, .yuv = 1, },{ - .name = "4:2:2 planar, Y-Cb-Cr", .fourcc = V4L2_PIX_FMT_YUV422P, .depth = 16, .pm = 0x09, @@ -162,7 +150,6 @@ static struct saa7134_format formats[] = { .hshift = 1, .vshift = 0, },{ - .name = "4:2:0 planar, Y-Cb-Cr", .fourcc = V4L2_PIX_FMT_YUV420, .depth = 12, .pm = 0x0a, @@ -171,7 +158,6 @@ static struct saa7134_format formats[] = { .hshift = 1, .vshift = 1, },{ - .name = "4:2:0 planar, Y-Cb-Cr", .fourcc = V4L2_PIX_FMT_YVU420, .depth = 12, .pm = 0x0a, @@ -720,10 +706,10 @@ static int start_preview(struct saa7134_dev *dev) return err; dev->ovfield = dev->win.field; - video_dbg("start_preview %dx%d+%d+%d %s field=%s\n", - dev->win.w.width, dev->win.w.height, - dev->win.w.left, dev->win.w.top, - dev->ovfmt->name, v4l2_field_names[dev->ovfield]); + video_dbg("%s %dx%d+%d+%d 0x%08x field=%s\n", __func__, + dev->win.w.width, dev->win.w.height, + dev->win.w.left, dev->win.w.top, + dev->ovfmt->fourcc, v4l2_field_names[dev->ovfield]); /* setup window + clipping */ set_size(dev, TASK_B, dev->win.w.width, dev->win.w.height, @@ -1780,9 +1766,6 @@ static int saa7134_enum_fmt_vid_cap(struct file *file, void *priv, if (f->index >= FORMATS) return -EINVAL; - strscpy(f->description, formats[f->index].name, - sizeof(f->description)); - f->pixelformat = formats[f->index].fourcc; return 0; @@ -1799,9 +1782,6 @@ static int saa7134_enum_fmt_vid_overlay(struct file *file, void *priv, if ((f->index >= FORMATS) || formats[f->index].planar) return -EINVAL; - strscpy(f->description, formats[f->index].name, - sizeof(f->description)); - f->pixelformat = formats[f->index].fourcc; return 0; diff --git a/drivers/media/pci/saa7134/saa7134.h b/drivers/media/pci/saa7134/saa7134.h index 6324f174c6f9..77c325e64a97 100644 --- a/drivers/media/pci/saa7134/saa7134.h +++ b/drivers/media/pci/saa7134/saa7134.h @@ -98,7 +98,6 @@ struct saa7134_tvaudio { }; struct saa7134_format { - char *name; unsigned int fourcc; unsigned int depth; unsigned int pm; diff --git a/drivers/media/pci/saa7164/saa7164-encoder.c b/drivers/media/pci/saa7164/saa7164-encoder.c index 43fdaa2d32bd..3fca7257a720 100644 --- a/drivers/media/pci/saa7164/saa7164-encoder.c +++ b/drivers/media/pci/saa7164/saa7164-encoder.c @@ -503,7 +503,6 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, if (f->index != 0) return -EINVAL; - strscpy(f->description, "MPEG", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_MPEG; return 0; diff --git a/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c b/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c index 609100a46ff8..a02f08459e14 100644 --- a/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c +++ b/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c @@ -822,25 +822,18 @@ static int solo_enc_enum_fmt_cap(struct file *file, void *priv, switch (dev_type) { case SOLO_DEV_6010: f->pixelformat = V4L2_PIX_FMT_MPEG4; - strscpy(f->description, "MPEG-4 part 2", - sizeof(f->description)); break; case SOLO_DEV_6110: f->pixelformat = V4L2_PIX_FMT_H264; - strscpy(f->description, "H.264", sizeof(f->description)); break; } break; case 1: f->pixelformat = V4L2_PIX_FMT_MJPEG; - strscpy(f->description, "MJPEG", sizeof(f->description)); break; default: return -EINVAL; } - - f->flags = V4L2_FMT_FLAG_COMPRESSED; - return 0; } diff --git a/drivers/media/pci/solo6x10/solo6x10-v4l2.c b/drivers/media/pci/solo6x10/solo6x10-v4l2.c index a968f75920b5..2efa539d6075 100644 --- a/drivers/media/pci/solo6x10/solo6x10-v4l2.c +++ b/drivers/media/pci/solo6x10/solo6x10-v4l2.c @@ -458,8 +458,6 @@ static int solo_enum_fmt_cap(struct file *file, void *priv, return -EINVAL; f->pixelformat = V4L2_PIX_FMT_UYVY; - strscpy(f->description, "UYUV 4:2:2 Packed", sizeof(f->description)); - return 0; } diff --git a/drivers/media/pci/sta2x11/sta2x11_vip.c b/drivers/media/pci/sta2x11/sta2x11_vip.c index e52e29814378..fd3de3bb0c89 100644 --- a/drivers/media/pci/sta2x11/sta2x11_vip.c +++ b/drivers/media/pci/sta2x11/sta2x11_vip.c @@ -560,9 +560,7 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, if (f->index != 0) return -EINVAL; - strscpy(f->description, "4:2:2, packed, UYVY", sizeof(f->description)); f->pixelformat = V4L2_PIX_FMT_UYVY; - f->flags = 0; return 0; } diff --git a/drivers/media/pci/tw68/tw68-video.c b/drivers/media/pci/tw68/tw68-video.c index 8e0952d65ad4..99e74c22d3be 100644 --- a/drivers/media/pci/tw68/tw68-video.c +++ b/drivers/media/pci/tw68/tw68-video.c @@ -34,53 +34,43 @@ */ static const struct tw68_format formats[] = { { - .name = "15 bpp RGB, le", .fourcc = V4L2_PIX_FMT_RGB555, .depth = 16, .twformat = ColorFormatRGB15, }, { - .name = "15 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB555X, .depth = 16, .twformat = ColorFormatRGB15 | ColorFormatBSWAP, }, { - .name = "16 bpp RGB, le", .fourcc = V4L2_PIX_FMT_RGB565, .depth = 16, .twformat = ColorFormatRGB16, }, { - .name = "16 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB565X, .depth = 16, .twformat = ColorFormatRGB16 | ColorFormatBSWAP, }, { - .name = "24 bpp RGB, le", .fourcc = V4L2_PIX_FMT_BGR24, .depth = 24, .twformat = ColorFormatRGB24, }, { - .name = "24 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB24, .depth = 24, .twformat = ColorFormatRGB24 | ColorFormatBSWAP, }, { - .name = "32 bpp RGB, le", .fourcc = V4L2_PIX_FMT_BGR32, .depth = 32, .twformat = ColorFormatRGB32, }, { - .name = "32 bpp RGB, be", .fourcc = V4L2_PIX_FMT_RGB32, .depth = 32, .twformat = ColorFormatRGB32 | ColorFormatBSWAP | ColorFormatWSWAP, }, { - .name = "4:2:2 packed, YUYV", .fourcc = V4L2_PIX_FMT_YUYV, .depth = 16, .twformat = ColorFormatYUY2, }, { - .name = "4:2:2 packed, UYVY", .fourcc = V4L2_PIX_FMT_UYVY, .depth = 16, .twformat = ColorFormatYUY2 | ColorFormatBSWAP, @@ -774,9 +764,6 @@ static int tw68_enum_fmt_vid_cap(struct file *file, void *priv, if (f->index >= FORMATS) return -EINVAL; - strscpy(f->description, formats[f->index].name, - sizeof(f->description)); - f->pixelformat = formats[f->index].fourcc; return 0; diff --git a/drivers/media/pci/tw68/tw68.h b/drivers/media/pci/tw68/tw68.h index 7021290d726a..a1f422d6e600 100644 --- a/drivers/media/pci/tw68/tw68.h +++ b/drivers/media/pci/tw68/tw68.h @@ -85,7 +85,6 @@ struct tw68_tvnorm { }; struct tw68_format { - char *name; u32 fourcc; u32 depth; u32 twformat; diff --git a/include/media/drv-intf/saa7146_vv.h b/include/media/drv-intf/saa7146_vv.h index b34d86bb0664..635805fb35e8 100644 --- a/include/media/drv-intf/saa7146_vv.h +++ b/include/media/drv-intf/saa7146_vv.h @@ -32,7 +32,6 @@ struct saa7146_video_dma { #define FORMAT_IS_PLANAR 0x2 struct saa7146_format { - char *name; u32 pixelformat; u32 trans; u8 depth; -- cgit v1.2.3 From 642ac63d166d07e43396a4d8c48ab24cb072cb18 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jun 2019 10:02:43 -0400 Subject: media: drivers/staging/media: don't set description for ENUM_FMT The V4L2 core sets the format description and flags for the driver in order to ensure consistent naming. So drop the strscpy of the description in drivers. Also remove any description strings in driver-internal structures since those are no longer needed. Note that bcm2835-camera.c: the formats array still stores the flags field for compressed formats since that information is used elsewhere in the driver. But enum_fmt doesn't use it anymore, since the core will set the COMPRESSED flag correctly. Signed-off-by: Hans Verkuil Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/omap4iss/iss_video.c | 40 ++++++++++------------ drivers/staging/media/omap4iss/iss_video.h | 2 -- drivers/staging/media/soc_camera/soc_camera.c | 2 -- .../vc04_services/bcm2835-camera/bcm2835-camera.c | 29 ---------------- .../vc04_services/bcm2835-camera/mmal-common.h | 1 - include/media/drv-intf/soc_mediabus.h | 2 -- 6 files changed, 19 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/drivers/staging/media/omap4iss/iss_video.c b/drivers/staging/media/omap4iss/iss_video.c index c307707480f7..54144dc9f509 100644 --- a/drivers/staging/media/omap4iss/iss_video.c +++ b/drivers/staging/media/omap4iss/iss_video.c @@ -31,61 +31,61 @@ static struct iss_format_info formats[] = { { MEDIA_BUS_FMT_Y8_1X8, MEDIA_BUS_FMT_Y8_1X8, MEDIA_BUS_FMT_Y8_1X8, MEDIA_BUS_FMT_Y8_1X8, - V4L2_PIX_FMT_GREY, 8, "Greyscale 8 bpp", }, + V4L2_PIX_FMT_GREY, 8, }, { MEDIA_BUS_FMT_Y10_1X10, MEDIA_BUS_FMT_Y10_1X10, MEDIA_BUS_FMT_Y10_1X10, MEDIA_BUS_FMT_Y8_1X8, - V4L2_PIX_FMT_Y10, 10, "Greyscale 10 bpp", }, + V4L2_PIX_FMT_Y10, 10, }, { MEDIA_BUS_FMT_Y12_1X12, MEDIA_BUS_FMT_Y10_1X10, MEDIA_BUS_FMT_Y12_1X12, MEDIA_BUS_FMT_Y8_1X8, - V4L2_PIX_FMT_Y12, 12, "Greyscale 12 bpp", }, + V4L2_PIX_FMT_Y12, 12, }, { MEDIA_BUS_FMT_SBGGR8_1X8, MEDIA_BUS_FMT_SBGGR8_1X8, MEDIA_BUS_FMT_SBGGR8_1X8, MEDIA_BUS_FMT_SBGGR8_1X8, - V4L2_PIX_FMT_SBGGR8, 8, "BGGR Bayer 8 bpp", }, + V4L2_PIX_FMT_SBGGR8, 8, }, { MEDIA_BUS_FMT_SGBRG8_1X8, MEDIA_BUS_FMT_SGBRG8_1X8, MEDIA_BUS_FMT_SGBRG8_1X8, MEDIA_BUS_FMT_SGBRG8_1X8, - V4L2_PIX_FMT_SGBRG8, 8, "GBRG Bayer 8 bpp", }, + V4L2_PIX_FMT_SGBRG8, 8, }, { MEDIA_BUS_FMT_SGRBG8_1X8, MEDIA_BUS_FMT_SGRBG8_1X8, MEDIA_BUS_FMT_SGRBG8_1X8, MEDIA_BUS_FMT_SGRBG8_1X8, - V4L2_PIX_FMT_SGRBG8, 8, "GRBG Bayer 8 bpp", }, + V4L2_PIX_FMT_SGRBG8, 8, }, { MEDIA_BUS_FMT_SRGGB8_1X8, MEDIA_BUS_FMT_SRGGB8_1X8, MEDIA_BUS_FMT_SRGGB8_1X8, MEDIA_BUS_FMT_SRGGB8_1X8, - V4L2_PIX_FMT_SRGGB8, 8, "RGGB Bayer 8 bpp", }, + V4L2_PIX_FMT_SRGGB8, 8, }, { MEDIA_BUS_FMT_SGRBG10_DPCM8_1X8, MEDIA_BUS_FMT_SGRBG10_DPCM8_1X8, MEDIA_BUS_FMT_SGRBG10_1X10, 0, - V4L2_PIX_FMT_SGRBG10DPCM8, 8, "GRBG Bayer 10 bpp DPCM8", }, + V4L2_PIX_FMT_SGRBG10DPCM8, 8, }, { MEDIA_BUS_FMT_SBGGR10_1X10, MEDIA_BUS_FMT_SBGGR10_1X10, MEDIA_BUS_FMT_SBGGR10_1X10, MEDIA_BUS_FMT_SBGGR8_1X8, - V4L2_PIX_FMT_SBGGR10, 10, "BGGR Bayer 10 bpp", }, + V4L2_PIX_FMT_SBGGR10, 10, }, { MEDIA_BUS_FMT_SGBRG10_1X10, MEDIA_BUS_FMT_SGBRG10_1X10, MEDIA_BUS_FMT_SGBRG10_1X10, MEDIA_BUS_FMT_SGBRG8_1X8, - V4L2_PIX_FMT_SGBRG10, 10, "GBRG Bayer 10 bpp", }, + V4L2_PIX_FMT_SGBRG10, 10, }, { MEDIA_BUS_FMT_SGRBG10_1X10, MEDIA_BUS_FMT_SGRBG10_1X10, MEDIA_BUS_FMT_SGRBG10_1X10, MEDIA_BUS_FMT_SGRBG8_1X8, - V4L2_PIX_FMT_SGRBG10, 10, "GRBG Bayer 10 bpp", }, + V4L2_PIX_FMT_SGRBG10, 10, }, { MEDIA_BUS_FMT_SRGGB10_1X10, MEDIA_BUS_FMT_SRGGB10_1X10, MEDIA_BUS_FMT_SRGGB10_1X10, MEDIA_BUS_FMT_SRGGB8_1X8, - V4L2_PIX_FMT_SRGGB10, 10, "RGGB Bayer 10 bpp", }, + V4L2_PIX_FMT_SRGGB10, 10, }, { MEDIA_BUS_FMT_SBGGR12_1X12, MEDIA_BUS_FMT_SBGGR10_1X10, MEDIA_BUS_FMT_SBGGR12_1X12, MEDIA_BUS_FMT_SBGGR8_1X8, - V4L2_PIX_FMT_SBGGR12, 12, "BGGR Bayer 12 bpp", }, + V4L2_PIX_FMT_SBGGR12, 12, }, { MEDIA_BUS_FMT_SGBRG12_1X12, MEDIA_BUS_FMT_SGBRG10_1X10, MEDIA_BUS_FMT_SGBRG12_1X12, MEDIA_BUS_FMT_SGBRG8_1X8, - V4L2_PIX_FMT_SGBRG12, 12, "GBRG Bayer 12 bpp", }, + V4L2_PIX_FMT_SGBRG12, 12, }, { MEDIA_BUS_FMT_SGRBG12_1X12, MEDIA_BUS_FMT_SGRBG10_1X10, MEDIA_BUS_FMT_SGRBG12_1X12, MEDIA_BUS_FMT_SGRBG8_1X8, - V4L2_PIX_FMT_SGRBG12, 12, "GRBG Bayer 12 bpp", }, + V4L2_PIX_FMT_SGRBG12, 12, }, { MEDIA_BUS_FMT_SRGGB12_1X12, MEDIA_BUS_FMT_SRGGB10_1X10, MEDIA_BUS_FMT_SRGGB12_1X12, MEDIA_BUS_FMT_SRGGB8_1X8, - V4L2_PIX_FMT_SRGGB12, 12, "RGGB Bayer 12 bpp", }, + V4L2_PIX_FMT_SRGGB12, 12, }, { MEDIA_BUS_FMT_UYVY8_1X16, MEDIA_BUS_FMT_UYVY8_1X16, MEDIA_BUS_FMT_UYVY8_1X16, 0, - V4L2_PIX_FMT_UYVY, 16, "YUV 4:2:2 (UYVY)", }, + V4L2_PIX_FMT_UYVY, 16, }, { MEDIA_BUS_FMT_YUYV8_1X16, MEDIA_BUS_FMT_YUYV8_1X16, MEDIA_BUS_FMT_YUYV8_1X16, 0, - V4L2_PIX_FMT_YUYV, 16, "YUV 4:2:2 (YUYV)", }, + V4L2_PIX_FMT_YUYV, 16, }, { MEDIA_BUS_FMT_YUYV8_1_5X8, MEDIA_BUS_FMT_YUYV8_1_5X8, MEDIA_BUS_FMT_YUYV8_1_5X8, 0, - V4L2_PIX_FMT_NV12, 8, "YUV 4:2:0 (NV12)", }, + V4L2_PIX_FMT_NV12, 8, }, }; const struct iss_format_info * @@ -563,8 +563,6 @@ iss_video_enum_format(struct file *file, void *fh, struct v4l2_fmtdesc *f) if (index == 0) { f->pixelformat = info->pixelformat; - strscpy(f->description, info->description, - sizeof(f->description)); return 0; } diff --git a/drivers/staging/media/omap4iss/iss_video.h b/drivers/staging/media/omap4iss/iss_video.h index f22489edb562..8b3dd92021e1 100644 --- a/drivers/staging/media/omap4iss/iss_video.h +++ b/drivers/staging/media/omap4iss/iss_video.h @@ -36,7 +36,6 @@ struct v4l2_pix_format; * shifted to be 8 bits per pixel. =0 if format is not shiftable. * @pixelformat: V4L2 pixel format FCC identifier * @bpp: Bits per pixel - * @description: Human-readable format description */ struct iss_format_info { u32 code; @@ -45,7 +44,6 @@ struct iss_format_info { u32 flavor; u32 pixelformat; unsigned int bpp; - const char *description; }; enum iss_pipeline_stream_state { diff --git a/drivers/staging/media/soc_camera/soc_camera.c b/drivers/staging/media/soc_camera/soc_camera.c index a6232dcd59bc..7b9448e3c9ba 100644 --- a/drivers/staging/media/soc_camera/soc_camera.c +++ b/drivers/staging/media/soc_camera/soc_camera.c @@ -869,8 +869,6 @@ static int soc_camera_enum_fmt_vid_cap(struct file *file, void *priv, format = icd->user_formats[f->index].host_fmt; - if (format->name) - strscpy(f->description, format->name, sizeof(f->description)); f->pixelformat = format->fourcc; return 0; } diff --git a/drivers/staging/vc04_services/bcm2835-camera/bcm2835-camera.c b/drivers/staging/vc04_services/bcm2835-camera/bcm2835-camera.c index ea54cc27e645..d4d1e44b16b2 100644 --- a/drivers/staging/vc04_services/bcm2835-camera/bcm2835-camera.c +++ b/drivers/staging/vc04_services/bcm2835-camera/bcm2835-camera.c @@ -75,34 +75,27 @@ static const struct v4l2_fract /* video formats */ static struct mmal_fmt formats[] = { { - .name = "4:2:0, planar, YUV", .fourcc = V4L2_PIX_FMT_YUV420, - .flags = 0, .mmal = MMAL_ENCODING_I420, .depth = 12, .mmal_component = COMP_CAMERA, .ybbp = 1, .remove_padding = 1, }, { - .name = "4:2:2, packed, YUYV", .fourcc = V4L2_PIX_FMT_YUYV, - .flags = 0, .mmal = MMAL_ENCODING_YUYV, .depth = 16, .mmal_component = COMP_CAMERA, .ybbp = 2, .remove_padding = 0, }, { - .name = "RGB24 (LE)", .fourcc = V4L2_PIX_FMT_RGB24, - .flags = 0, .mmal = MMAL_ENCODING_RGB24, .depth = 24, .mmal_component = COMP_CAMERA, .ybbp = 3, .remove_padding = 0, }, { - .name = "JPEG", .fourcc = V4L2_PIX_FMT_JPEG, .flags = V4L2_FMT_FLAG_COMPRESSED, .mmal = MMAL_ENCODING_JPEG, @@ -111,7 +104,6 @@ static struct mmal_fmt formats[] = { .ybbp = 0, .remove_padding = 0, }, { - .name = "H264", .fourcc = V4L2_PIX_FMT_H264, .flags = V4L2_FMT_FLAG_COMPRESSED, .mmal = MMAL_ENCODING_H264, @@ -120,7 +112,6 @@ static struct mmal_fmt formats[] = { .ybbp = 0, .remove_padding = 0, }, { - .name = "MJPEG", .fourcc = V4L2_PIX_FMT_MJPEG, .flags = V4L2_FMT_FLAG_COMPRESSED, .mmal = MMAL_ENCODING_MJPEG, @@ -129,72 +120,56 @@ static struct mmal_fmt formats[] = { .ybbp = 0, .remove_padding = 0, }, { - .name = "4:2:2, packed, YVYU", .fourcc = V4L2_PIX_FMT_YVYU, - .flags = 0, .mmal = MMAL_ENCODING_YVYU, .depth = 16, .mmal_component = COMP_CAMERA, .ybbp = 2, .remove_padding = 0, }, { - .name = "4:2:2, packed, VYUY", .fourcc = V4L2_PIX_FMT_VYUY, - .flags = 0, .mmal = MMAL_ENCODING_VYUY, .depth = 16, .mmal_component = COMP_CAMERA, .ybbp = 2, .remove_padding = 0, }, { - .name = "4:2:2, packed, UYVY", .fourcc = V4L2_PIX_FMT_UYVY, - .flags = 0, .mmal = MMAL_ENCODING_UYVY, .depth = 16, .mmal_component = COMP_CAMERA, .ybbp = 2, .remove_padding = 0, }, { - .name = "4:2:0, planar, NV12", .fourcc = V4L2_PIX_FMT_NV12, - .flags = 0, .mmal = MMAL_ENCODING_NV12, .depth = 12, .mmal_component = COMP_CAMERA, .ybbp = 1, .remove_padding = 1, }, { - .name = "RGB24 (BE)", .fourcc = V4L2_PIX_FMT_BGR24, - .flags = 0, .mmal = MMAL_ENCODING_BGR24, .depth = 24, .mmal_component = COMP_CAMERA, .ybbp = 3, .remove_padding = 0, }, { - .name = "4:2:0, planar, YVU", .fourcc = V4L2_PIX_FMT_YVU420, - .flags = 0, .mmal = MMAL_ENCODING_YV12, .depth = 12, .mmal_component = COMP_CAMERA, .ybbp = 1, .remove_padding = 1, }, { - .name = "4:2:0, planar, NV21", .fourcc = V4L2_PIX_FMT_NV21, - .flags = 0, .mmal = MMAL_ENCODING_NV21, .depth = 12, .mmal_component = COMP_CAMERA, .ybbp = 1, .remove_padding = 1, }, { - .name = "RGB32 (BE)", .fourcc = V4L2_PIX_FMT_BGR32, - .flags = 0, .mmal = MMAL_ENCODING_BGRA, .depth = 32, .mmal_component = COMP_CAMERA, @@ -716,9 +691,7 @@ static int vidioc_enum_fmt_vid_overlay(struct file *file, void *priv, fmt = &formats[f->index]; - strlcpy((char *)f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; - f->flags = fmt->flags; return 0; } @@ -919,9 +892,7 @@ static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, fmt = &formats[f->index]; - strlcpy((char *)f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; - f->flags = fmt->flags; return 0; } diff --git a/drivers/staging/vc04_services/bcm2835-camera/mmal-common.h b/drivers/staging/vc04_services/bcm2835-camera/mmal-common.h index 6f56c517d850..ff5398737b4a 100644 --- a/drivers/staging/vc04_services/bcm2835-camera/mmal-common.h +++ b/drivers/staging/vc04_services/bcm2835-camera/mmal-common.h @@ -26,7 +26,6 @@ struct mmal_msg_context; /* mapping between v4l and mmal video modes */ struct mmal_fmt { - char *name; u32 fourcc; /* v4l2 format id */ int flags; /* v4l2 flags field */ u32 mmal; diff --git a/include/media/drv-intf/soc_mediabus.h b/include/media/drv-intf/soc_mediabus.h index 73de3bd0c605..361f8852c9fc 100644 --- a/include/media/drv-intf/soc_mediabus.h +++ b/include/media/drv-intf/soc_mediabus.h @@ -66,7 +66,6 @@ enum soc_mbus_layout { /** * struct soc_mbus_pixelfmt - Data format on the media bus - * @name: Name of the format * @fourcc: Fourcc code, that will be obtained if the data is * stored in memory in the following way: * @packing: Type of sample-packing, that has to be used @@ -74,7 +73,6 @@ enum soc_mbus_layout { * @bits_per_sample: How many bits the bridge has to sample */ struct soc_mbus_pixelfmt { - const char *name; u32 fourcc; enum soc_mbus_packing packing; enum soc_mbus_order order; -- cgit v1.2.3 From 59fe916c84f891aab35019adc45377a10f5690b1 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jun 2019 10:25:15 -0400 Subject: media: media/platform: don't set description in ENUM_FMT The V4L2 core sets the format description and flags for the driver in order to ensure consistent naming. So drop the strscpy of the description in drivers. Also remove any description strings in driver-internal structures since those are no longer needed. And in am437x-vpfe.c drop an unnecessary f->type assignment in vpfe_enum_fmt(). Signed-off-by: Hans Verkuil Acked-by: Benoit Parrot Reviewed-by: Laurent Pinchart [hverkuil-cisco@xs4all.nl: addressed some small suggestions from Laurent] Acked-by: Lad, Prabhakar Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/am437x/am437x-vpfe.c | 18 ++------------- drivers/media/platform/davinci/vpbe_display.c | 14 ++--------- drivers/media/platform/davinci/vpif_capture.c | 11 ++------- drivers/media/platform/davinci/vpif_display.c | 4 ---- drivers/media/platform/exynos-gsc/gsc-core.c | 22 ------------------ drivers/media/platform/exynos-gsc/gsc-core.h | 2 -- drivers/media/platform/exynos4-is/fimc-capture.c | 3 --- drivers/media/platform/exynos4-is/fimc-core.c | 20 ---------------- drivers/media/platform/exynos4-is/fimc-isp-video.c | 1 - drivers/media/platform/exynos4-is/fimc-isp.c | 3 --- drivers/media/platform/exynos4-is/fimc-lite.c | 8 ------- drivers/media/platform/exynos4-is/fimc-m2m.c | 1 - drivers/media/platform/m2m-deinterlace.c | 4 ---- drivers/media/platform/marvell-ccic/mcam-core.c | 10 -------- drivers/media/platform/mx2_emmaprp.c | 4 ---- drivers/media/platform/omap/omap_vout.c | 7 ------ drivers/media/platform/s3c-camif/camif-capture.c | 11 ++++----- drivers/media/platform/s3c-camif/camif-core.c | 6 ----- drivers/media/platform/s3c-camif/camif-core.h | 1 - drivers/media/platform/s5p-g2d/g2d.c | 10 +------- drivers/media/platform/s5p-g2d/g2d.h | 1 - drivers/media/platform/s5p-jpeg/jpeg-core.c | 27 ---------------------- drivers/media/platform/s5p-jpeg/jpeg-core.h | 2 -- drivers/media/platform/s5p-mfc/s5p_mfc_common.h | 1 - drivers/media/platform/s5p-mfc/s5p_mfc_dec.c | 19 +-------------- drivers/media/platform/s5p-mfc/s5p_mfc_enc.c | 14 +---------- drivers/media/platform/sh_veu.c | 19 +++++++-------- drivers/media/platform/sh_vou.c | 12 ++-------- drivers/media/platform/ti-vpe/vpe.c | 12 ---------- drivers/media/platform/via-camera.c | 4 ---- drivers/media/platform/xilinx/xilinx-dma.c | 2 -- drivers/media/platform/xilinx/xilinx-vip.c | 16 ++++++------- drivers/media/platform/xilinx/xilinx-vip.h | 2 -- include/media/drv-intf/exynos-fimc.h | 2 -- 34 files changed, 31 insertions(+), 262 deletions(-) (limited to 'include') diff --git a/drivers/media/platform/am437x/am437x-vpfe.c b/drivers/media/platform/am437x/am437x-vpfe.c index fe7b937eb5f2..7582c26f8459 100644 --- a/drivers/media/platform/am437x/am437x-vpfe.c +++ b/drivers/media/platform/am437x/am437x-vpfe.c @@ -76,7 +76,6 @@ struct bus_format { /* * struct vpfe_fmt - VPFE media bus format information - * @name: V4L2 format description * @code: V4L2 media bus format code * @shifted: V4L2 media bus format code for the same pixel layout but * shifted to be 8 bits per pixel. =0 if format is not shiftable. @@ -86,7 +85,6 @@ struct bus_format { * @supported: Indicates format supported by subdev */ struct vpfe_fmt { - const char *name; u32 fourcc; u32 code; struct bus_format l; @@ -97,7 +95,6 @@ struct vpfe_fmt { static struct vpfe_fmt formats[] = { { - .name = "YUV 4:2:2 packed, YCbYCr", .fourcc = V4L2_PIX_FMT_YUYV, .code = MEDIA_BUS_FMT_YUYV8_2X8, .l.width = 10, @@ -106,7 +103,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 2, .supported = false, }, { - .name = "YUV 4:2:2 packed, CbYCrY", .fourcc = V4L2_PIX_FMT_UYVY, .code = MEDIA_BUS_FMT_UYVY8_2X8, .l.width = 10, @@ -115,7 +111,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 2, .supported = false, }, { - .name = "YUV 4:2:2 packed, YCrYCb", .fourcc = V4L2_PIX_FMT_YVYU, .code = MEDIA_BUS_FMT_YVYU8_2X8, .l.width = 10, @@ -124,7 +119,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 2, .supported = false, }, { - .name = "YUV 4:2:2 packed, CrYCbY", .fourcc = V4L2_PIX_FMT_VYUY, .code = MEDIA_BUS_FMT_VYUY8_2X8, .l.width = 10, @@ -133,7 +127,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 2, .supported = false, }, { - .name = "RAW8 BGGR", .fourcc = V4L2_PIX_FMT_SBGGR8, .code = MEDIA_BUS_FMT_SBGGR8_1X8, .l.width = 10, @@ -142,7 +135,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 1, .supported = false, }, { - .name = "RAW8 GBRG", .fourcc = V4L2_PIX_FMT_SGBRG8, .code = MEDIA_BUS_FMT_SGBRG8_1X8, .l.width = 10, @@ -151,7 +143,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 1, .supported = false, }, { - .name = "RAW8 GRBG", .fourcc = V4L2_PIX_FMT_SGRBG8, .code = MEDIA_BUS_FMT_SGRBG8_1X8, .l.width = 10, @@ -160,7 +151,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 1, .supported = false, }, { - .name = "RAW8 RGGB", .fourcc = V4L2_PIX_FMT_SRGGB8, .code = MEDIA_BUS_FMT_SRGGB8_1X8, .l.width = 10, @@ -169,7 +159,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 1, .supported = false, }, { - .name = "RGB565 (LE)", .fourcc = V4L2_PIX_FMT_RGB565, .code = MEDIA_BUS_FMT_RGB565_2X8_LE, .l.width = 10, @@ -178,7 +167,6 @@ static struct vpfe_fmt formats[] = { .s.bpp = 2, .supported = false, }, { - .name = "RGB565 (BE)", .fourcc = V4L2_PIX_FMT_RGB565X, .code = MEDIA_BUS_FMT_RGB565_2X8_BE, .l.width = 10, @@ -1540,12 +1528,10 @@ static int vpfe_enum_fmt(struct file *file, void *priv, if (!fmt) return -EINVAL; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; - f->type = vpfe->fmt.type; - vpfe_dbg(1, vpfe, "vpfe_enum_format: mbus index: %d code: %x pixelformat: %s [%s]\n", - f->index, fmt->code, print_fourcc(fmt->fourcc), fmt->name); + vpfe_dbg(1, vpfe, "vpfe_enum_format: mbus index: %d code: %x pixelformat: %s\n", + f->index, fmt->code, print_fourcc(fmt->fourcc)); return 0; } diff --git a/drivers/media/platform/davinci/vpbe_display.c b/drivers/media/platform/davinci/vpbe_display.c index 000b191c42d8..ed10a9bbf96b 100644 --- a/drivers/media/platform/davinci/vpbe_display.c +++ b/drivers/media/platform/davinci/vpbe_display.c @@ -792,7 +792,6 @@ static int vpbe_display_enum_fmt(struct file *file, void *priv, { struct vpbe_layer *layer = video_drvdata(file); struct vpbe_device *vpbe_dev = layer->disp_dev->vpbe_dev; - unsigned int index = 0; v4l2_dbg(1, debug, &vpbe_dev->v4l2_dev, "VIDIOC_ENUM_FMT, layer id = %d\n", @@ -803,19 +802,10 @@ static int vpbe_display_enum_fmt(struct file *file, void *priv, } /* Fill in the information about format */ - index = fmt->index; - memset(fmt, 0, sizeof(*fmt)); - fmt->index = index; - fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; - if (index == 0) { - strscpy(fmt->description, "YUV 4:2:2 - UYVY", - sizeof(fmt->description)); + if (fmt->index == 0) fmt->pixelformat = V4L2_PIX_FMT_UYVY; - } else { - strscpy(fmt->description, "Y/CbCr 4:2:0", - sizeof(fmt->description)); + else fmt->pixelformat = V4L2_PIX_FMT_NV12; - } return 0; } diff --git a/drivers/media/platform/davinci/vpif_capture.c b/drivers/media/platform/davinci/vpif_capture.c index f0f7ef638c56..621d28470d2b 100644 --- a/drivers/media/platform/davinci/vpif_capture.c +++ b/drivers/media/platform/davinci/vpif_capture.c @@ -938,17 +938,10 @@ static int vpif_enum_fmt_vid_cap(struct file *file, void *priv, } /* Fill in the information about format */ - if (ch->vpifparams.iface.if_type == VPIF_IF_RAW_BAYER) { - fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - strscpy(fmt->description, "Raw Mode -Bayer Pattern GrRBGb", - sizeof(fmt->description)); + if (ch->vpifparams.iface.if_type == VPIF_IF_RAW_BAYER) fmt->pixelformat = V4L2_PIX_FMT_SBGGR8; - } else { - fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - strscpy(fmt->description, "YCbCr4:2:2 Semi-Planar", - sizeof(fmt->description)); + else fmt->pixelformat = V4L2_PIX_FMT_NV16; - } return 0; } diff --git a/drivers/media/platform/davinci/vpif_display.c b/drivers/media/platform/davinci/vpif_display.c index a69897c68a50..be32f87001cd 100644 --- a/drivers/media/platform/davinci/vpif_display.c +++ b/drivers/media/platform/davinci/vpif_display.c @@ -601,11 +601,7 @@ static int vpif_enum_fmt_vid_out(struct file *file, void *priv, return -EINVAL; /* Fill in the information about format */ - fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; - strscpy(fmt->description, "YCbCr4:2:2 YC Planar", - sizeof(fmt->description)); fmt->pixelformat = V4L2_PIX_FMT_YUV422P; - fmt->flags = 0; return 0; } diff --git a/drivers/media/platform/exynos-gsc/gsc-core.c b/drivers/media/platform/exynos-gsc/gsc-core.c index 854869f0024e..f6650b45bc3d 100644 --- a/drivers/media/platform/exynos-gsc/gsc-core.c +++ b/drivers/media/platform/exynos-gsc/gsc-core.c @@ -27,21 +27,18 @@ static const struct gsc_fmt gsc_formats[] = { { - .name = "RGB565", .pixelformat = V4L2_PIX_FMT_RGB565X, .depth = { 16 }, .color = GSC_RGB, .num_planes = 1, .num_comp = 1, }, { - .name = "BGRX-8-8-8-8, 32 bpp", .pixelformat = V4L2_PIX_FMT_BGR32, .depth = { 32 }, .color = GSC_RGB, .num_planes = 1, .num_comp = 1, }, { - .name = "YUV 4:2:2 packed, YCbYCr", .pixelformat = V4L2_PIX_FMT_YUYV, .depth = { 16 }, .color = GSC_YUV422, @@ -51,7 +48,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_comp = 1, .mbus_code = MEDIA_BUS_FMT_YUYV8_2X8, }, { - .name = "YUV 4:2:2 packed, CbYCrY", .pixelformat = V4L2_PIX_FMT_UYVY, .depth = { 16 }, .color = GSC_YUV422, @@ -61,7 +57,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_comp = 1, .mbus_code = MEDIA_BUS_FMT_UYVY8_2X8, }, { - .name = "YUV 4:2:2 packed, CrYCbY", .pixelformat = V4L2_PIX_FMT_VYUY, .depth = { 16 }, .color = GSC_YUV422, @@ -71,7 +66,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_comp = 1, .mbus_code = MEDIA_BUS_FMT_VYUY8_2X8, }, { - .name = "YUV 4:2:2 packed, YCrYCb", .pixelformat = V4L2_PIX_FMT_YVYU, .depth = { 16 }, .color = GSC_YUV422, @@ -81,7 +75,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_comp = 1, .mbus_code = MEDIA_BUS_FMT_YVYU8_2X8, }, { - .name = "YUV 4:4:4 planar, YCbYCr", .pixelformat = V4L2_PIX_FMT_YUV32, .depth = { 32 }, .color = GSC_YUV444, @@ -90,7 +83,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 1, .num_comp = 1, }, { - .name = "YUV 4:2:2 planar, Y/Cb/Cr", .pixelformat = V4L2_PIX_FMT_YUV422P, .depth = { 16 }, .color = GSC_YUV422, @@ -99,7 +91,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 1, .num_comp = 3, }, { - .name = "YUV 4:2:2 planar, Y/CbCr", .pixelformat = V4L2_PIX_FMT_NV16, .depth = { 16 }, .color = GSC_YUV422, @@ -108,7 +99,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 1, .num_comp = 2, }, { - .name = "YUV 4:2:2 non-contig, Y/CbCr", .pixelformat = V4L2_PIX_FMT_NV16M, .depth = { 8, 8 }, .color = GSC_YUV422, @@ -117,7 +107,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 2, .num_comp = 2, }, { - .name = "YUV 4:2:2 planar, Y/CrCb", .pixelformat = V4L2_PIX_FMT_NV61, .depth = { 16 }, .color = GSC_YUV422, @@ -126,7 +115,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 1, .num_comp = 2, }, { - .name = "YUV 4:2:2 non-contig, Y/CrCb", .pixelformat = V4L2_PIX_FMT_NV61M, .depth = { 8, 8 }, .color = GSC_YUV422, @@ -135,7 +123,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 2, .num_comp = 2, }, { - .name = "YUV 4:2:0 planar, YCbCr", .pixelformat = V4L2_PIX_FMT_YUV420, .depth = { 12 }, .color = GSC_YUV420, @@ -144,7 +131,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 1, .num_comp = 3, }, { - .name = "YUV 4:2:0 planar, YCrCb", .pixelformat = V4L2_PIX_FMT_YVU420, .depth = { 12 }, .color = GSC_YUV420, @@ -154,7 +140,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_comp = 3, }, { - .name = "YUV 4:2:0 planar, Y/CbCr", .pixelformat = V4L2_PIX_FMT_NV12, .depth = { 12 }, .color = GSC_YUV420, @@ -163,7 +148,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 1, .num_comp = 2, }, { - .name = "YUV 4:2:0 planar, Y/CrCb", .pixelformat = V4L2_PIX_FMT_NV21, .depth = { 12 }, .color = GSC_YUV420, @@ -172,7 +156,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 1, .num_comp = 2, }, { - .name = "YUV 4:2:0 non-contig. 2p, Y/CrCb", .pixelformat = V4L2_PIX_FMT_NV21M, .depth = { 8, 4 }, .color = GSC_YUV420, @@ -181,7 +164,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 2, .num_comp = 2, }, { - .name = "YUV 4:2:0 non-contig. 2p, Y/CbCr", .pixelformat = V4L2_PIX_FMT_NV12M, .depth = { 8, 4 }, .color = GSC_YUV420, @@ -190,7 +172,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 2, .num_comp = 2, }, { - .name = "YUV 4:2:0 non-contig. 3p, Y/Cb/Cr", .pixelformat = V4L2_PIX_FMT_YUV420M, .depth = { 8, 2, 2 }, .color = GSC_YUV420, @@ -199,7 +180,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 3, .num_comp = 3, }, { - .name = "YUV 4:2:0 non-contig. 3p, Y/Cr/Cb", .pixelformat = V4L2_PIX_FMT_YVU420M, .depth = { 8, 2, 2 }, .color = GSC_YUV420, @@ -208,7 +188,6 @@ static const struct gsc_fmt gsc_formats[] = { .num_planes = 3, .num_comp = 3, }, { - .name = "YUV 4:2:0 n.c. 2p, Y/CbCr tiled", .pixelformat = V4L2_PIX_FMT_NV12MT_16X16, .depth = { 8, 4 }, .color = GSC_YUV420, @@ -335,7 +314,6 @@ int gsc_enum_fmt(struct v4l2_fmtdesc *f) if (!fmt) return -EINVAL; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->pixelformat; return 0; diff --git a/drivers/media/platform/exynos-gsc/gsc-core.h b/drivers/media/platform/exynos-gsc/gsc-core.h index 772183b090c2..8e5a9acb78aa 100644 --- a/drivers/media/platform/exynos-gsc/gsc-core.h +++ b/drivers/media/platform/exynos-gsc/gsc-core.h @@ -103,7 +103,6 @@ enum gsc_yuv_fmt { /** * struct gsc_fmt - the driver's internal color format data * @mbus_code: Media Bus pixel code, -1 if not applicable - * @name: format description * @pixelformat: the fourcc code for this format, 0 if not applicable * @yorder: Y/C order * @corder: Chrominance order control @@ -114,7 +113,6 @@ enum gsc_yuv_fmt { */ struct gsc_fmt { u32 mbus_code; - char *name; u32 pixelformat; u32 color; u32 yorder; diff --git a/drivers/media/platform/exynos4-is/fimc-capture.c b/drivers/media/platform/exynos4-is/fimc-capture.c index 66510365dd5d..121d609ff856 100644 --- a/drivers/media/platform/exynos4-is/fimc-capture.c +++ b/drivers/media/platform/exynos4-is/fimc-capture.c @@ -738,10 +738,7 @@ static int fimc_cap_enum_fmt(struct file *file, void *priv, f->index); if (!fmt) return -EINVAL; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; - if (fmt->fourcc == MEDIA_BUS_FMT_JPEG_1X8) - f->flags |= V4L2_FMT_FLAG_COMPRESSED; return 0; } diff --git a/drivers/media/platform/exynos4-is/fimc-core.c b/drivers/media/platform/exynos4-is/fimc-core.c index 7006f54bfee2..cde60fbb23a8 100644 --- a/drivers/media/platform/exynos4-is/fimc-core.c +++ b/drivers/media/platform/exynos4-is/fimc-core.c @@ -36,7 +36,6 @@ static char *fimc_clocks[MAX_FIMC_CLOCKS] = { static struct fimc_fmt fimc_formats[] = { { - .name = "RGB565", .fourcc = V4L2_PIX_FMT_RGB565, .depth = { 16 }, .color = FIMC_FMT_RGB565, @@ -44,7 +43,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 1, .flags = FMT_FLAGS_M2M, }, { - .name = "BGR666", .fourcc = V4L2_PIX_FMT_BGR666, .depth = { 32 }, .color = FIMC_FMT_RGB666, @@ -52,7 +50,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 1, .flags = FMT_FLAGS_M2M, }, { - .name = "BGRA8888, 32 bpp", .fourcc = V4L2_PIX_FMT_BGR32, .depth = { 32 }, .color = FIMC_FMT_RGB888, @@ -60,7 +57,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 1, .flags = FMT_FLAGS_M2M | FMT_HAS_ALPHA, }, { - .name = "ARGB1555", .fourcc = V4L2_PIX_FMT_RGB555, .depth = { 16 }, .color = FIMC_FMT_RGB555, @@ -68,7 +64,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 1, .flags = FMT_FLAGS_M2M_OUT | FMT_HAS_ALPHA, }, { - .name = "ARGB4444", .fourcc = V4L2_PIX_FMT_RGB444, .depth = { 16 }, .color = FIMC_FMT_RGB444, @@ -76,11 +71,9 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 1, .flags = FMT_FLAGS_M2M_OUT | FMT_HAS_ALPHA, }, { - .name = "YUV 4:4:4", .mbus_code = MEDIA_BUS_FMT_YUV10_1X30, .flags = FMT_FLAGS_WRITEBACK, }, { - .name = "YUV 4:2:2 packed, YCbYCr", .fourcc = V4L2_PIX_FMT_YUYV, .depth = { 16 }, .color = FIMC_FMT_YCBYCR422, @@ -89,7 +82,6 @@ static struct fimc_fmt fimc_formats[] = { .mbus_code = MEDIA_BUS_FMT_YUYV8_2X8, .flags = FMT_FLAGS_M2M | FMT_FLAGS_CAM, }, { - .name = "YUV 4:2:2 packed, CbYCrY", .fourcc = V4L2_PIX_FMT_UYVY, .depth = { 16 }, .color = FIMC_FMT_CBYCRY422, @@ -98,7 +90,6 @@ static struct fimc_fmt fimc_formats[] = { .mbus_code = MEDIA_BUS_FMT_UYVY8_2X8, .flags = FMT_FLAGS_M2M | FMT_FLAGS_CAM, }, { - .name = "YUV 4:2:2 packed, CrYCbY", .fourcc = V4L2_PIX_FMT_VYUY, .depth = { 16 }, .color = FIMC_FMT_CRYCBY422, @@ -107,7 +98,6 @@ static struct fimc_fmt fimc_formats[] = { .mbus_code = MEDIA_BUS_FMT_VYUY8_2X8, .flags = FMT_FLAGS_M2M | FMT_FLAGS_CAM, }, { - .name = "YUV 4:2:2 packed, YCrYCb", .fourcc = V4L2_PIX_FMT_YVYU, .depth = { 16 }, .color = FIMC_FMT_YCRYCB422, @@ -116,7 +106,6 @@ static struct fimc_fmt fimc_formats[] = { .mbus_code = MEDIA_BUS_FMT_YVYU8_2X8, .flags = FMT_FLAGS_M2M | FMT_FLAGS_CAM, }, { - .name = "YUV 4:2:2 planar, Y/Cb/Cr", .fourcc = V4L2_PIX_FMT_YUV422P, .depth = { 16 }, .color = FIMC_FMT_YCBYCR422, @@ -124,7 +113,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 3, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:2 planar, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV16, .depth = { 16 }, .color = FIMC_FMT_YCBYCR422, @@ -132,7 +120,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 2, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:2 planar, Y/CrCb", .fourcc = V4L2_PIX_FMT_NV61, .depth = { 16 }, .color = FIMC_FMT_YCRYCB422, @@ -140,7 +127,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 2, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:0 planar, YCbCr", .fourcc = V4L2_PIX_FMT_YUV420, .depth = { 12 }, .color = FIMC_FMT_YCBCR420, @@ -148,7 +134,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 3, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:0 planar, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV12, .depth = { 12 }, .color = FIMC_FMT_YCBCR420, @@ -156,7 +141,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 2, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:0 non-contig. 2p, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV12M, .color = FIMC_FMT_YCBCR420, .depth = { 8, 4 }, @@ -164,7 +148,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 2, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:0 non-contig. 3p, Y/Cb/Cr", .fourcc = V4L2_PIX_FMT_YUV420M, .color = FIMC_FMT_YCBCR420, .depth = { 8, 2, 2 }, @@ -172,7 +155,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 3, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:0 non-contig. 2p, tiled", .fourcc = V4L2_PIX_FMT_NV12MT, .color = FIMC_FMT_YCBCR420, .depth = { 8, 4 }, @@ -180,7 +162,6 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 2, .flags = FMT_FLAGS_M2M, }, { - .name = "JPEG encoded data", .fourcc = V4L2_PIX_FMT_JPEG, .color = FIMC_FMT_JPEG, .depth = { 8 }, @@ -189,7 +170,6 @@ static struct fimc_fmt fimc_formats[] = { .mbus_code = MEDIA_BUS_FMT_JPEG_1X8, .flags = FMT_FLAGS_CAM | FMT_FLAGS_COMPRESSED, }, { - .name = "S5C73MX interleaved UYVY/JPEG", .fourcc = V4L2_PIX_FMT_S5C_UYVY_JPG, .color = FIMC_FMT_YUYV_JPEG, .depth = { 8 }, diff --git a/drivers/media/platform/exynos4-is/fimc-isp-video.c b/drivers/media/platform/exynos4-is/fimc-isp-video.c index a75f932a289a..378cc302e1f8 100644 --- a/drivers/media/platform/exynos4-is/fimc-isp-video.c +++ b/drivers/media/platform/exynos4-is/fimc-isp-video.c @@ -362,7 +362,6 @@ static int isp_video_enum_fmt(struct file *file, void *priv, if (WARN_ON(fmt == NULL)) return -EINVAL; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; return 0; diff --git a/drivers/media/platform/exynos4-is/fimc-isp.c b/drivers/media/platform/exynos4-is/fimc-isp.c index 907b83e6649d..cde0d254ec1c 100644 --- a/drivers/media/platform/exynos4-is/fimc-isp.c +++ b/drivers/media/platform/exynos4-is/fimc-isp.c @@ -33,21 +33,18 @@ module_param_named(debug_isp, fimc_isp_debug, int, S_IRUGO | S_IWUSR); static const struct fimc_fmt fimc_isp_formats[FIMC_ISP_NUM_FORMATS] = { { - .name = "RAW8 (GRBG)", .fourcc = V4L2_PIX_FMT_SGRBG8, .depth = { 8 }, .color = FIMC_FMT_RAW8, .memplanes = 1, .mbus_code = MEDIA_BUS_FMT_SGRBG8_1X8, }, { - .name = "RAW10 (GRBG)", .fourcc = V4L2_PIX_FMT_SGRBG10, .depth = { 10 }, .color = FIMC_FMT_RAW10, .memplanes = 1, .mbus_code = MEDIA_BUS_FMT_SGRBG10_1X10, }, { - .name = "RAW12 (GRBG)", .fourcc = V4L2_PIX_FMT_SGRBG12, .depth = { 12 }, .color = FIMC_FMT_RAW12, diff --git a/drivers/media/platform/exynos4-is/fimc-lite.c b/drivers/media/platform/exynos4-is/fimc-lite.c index c1f0aee02e5e..e87c6a09205b 100644 --- a/drivers/media/platform/exynos4-is/fimc-lite.c +++ b/drivers/media/platform/exynos4-is/fimc-lite.c @@ -39,7 +39,6 @@ module_param(debug, int, 0644); static const struct fimc_fmt fimc_lite_formats[] = { { - .name = "YUV 4:2:2 packed, YCbYCr", .fourcc = V4L2_PIX_FMT_YUYV, .colorspace = V4L2_COLORSPACE_JPEG, .depth = { 16 }, @@ -48,7 +47,6 @@ static const struct fimc_fmt fimc_lite_formats[] = { .mbus_code = MEDIA_BUS_FMT_YUYV8_2X8, .flags = FMT_FLAGS_YUV, }, { - .name = "YUV 4:2:2 packed, CbYCrY", .fourcc = V4L2_PIX_FMT_UYVY, .colorspace = V4L2_COLORSPACE_JPEG, .depth = { 16 }, @@ -57,7 +55,6 @@ static const struct fimc_fmt fimc_lite_formats[] = { .mbus_code = MEDIA_BUS_FMT_UYVY8_2X8, .flags = FMT_FLAGS_YUV, }, { - .name = "YUV 4:2:2 packed, CrYCbY", .fourcc = V4L2_PIX_FMT_VYUY, .colorspace = V4L2_COLORSPACE_JPEG, .depth = { 16 }, @@ -66,7 +63,6 @@ static const struct fimc_fmt fimc_lite_formats[] = { .mbus_code = MEDIA_BUS_FMT_VYUY8_2X8, .flags = FMT_FLAGS_YUV, }, { - .name = "YUV 4:2:2 packed, YCrYCb", .fourcc = V4L2_PIX_FMT_YVYU, .colorspace = V4L2_COLORSPACE_JPEG, .depth = { 16 }, @@ -75,7 +71,6 @@ static const struct fimc_fmt fimc_lite_formats[] = { .mbus_code = MEDIA_BUS_FMT_YVYU8_2X8, .flags = FMT_FLAGS_YUV, }, { - .name = "RAW8 (GRBG)", .fourcc = V4L2_PIX_FMT_SGRBG8, .colorspace = V4L2_COLORSPACE_SRGB, .depth = { 8 }, @@ -84,7 +79,6 @@ static const struct fimc_fmt fimc_lite_formats[] = { .mbus_code = MEDIA_BUS_FMT_SGRBG8_1X8, .flags = FMT_FLAGS_RAW_BAYER, }, { - .name = "RAW10 (GRBG)", .fourcc = V4L2_PIX_FMT_SGRBG10, .colorspace = V4L2_COLORSPACE_SRGB, .depth = { 16 }, @@ -93,7 +87,6 @@ static const struct fimc_fmt fimc_lite_formats[] = { .mbus_code = MEDIA_BUS_FMT_SGRBG10_1X10, .flags = FMT_FLAGS_RAW_BAYER, }, { - .name = "RAW12 (GRBG)", .fourcc = V4L2_PIX_FMT_SGRBG12, .colorspace = V4L2_COLORSPACE_SRGB, .depth = { 16 }, @@ -667,7 +660,6 @@ static int fimc_lite_enum_fmt(struct file *file, void *priv, return -EINVAL; fmt = &fimc_lite_formats[f->index]; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; return 0; diff --git a/drivers/media/platform/exynos4-is/fimc-m2m.c b/drivers/media/platform/exynos4-is/fimc-m2m.c index 62e876fc3555..c70c2cbe3eb1 100644 --- a/drivers/media/platform/exynos4-is/fimc-m2m.c +++ b/drivers/media/platform/exynos4-is/fimc-m2m.c @@ -247,7 +247,6 @@ static int fimc_m2m_enum_fmt(struct file *file, void *priv, if (!fmt) return -EINVAL; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; return 0; } diff --git a/drivers/media/platform/m2m-deinterlace.c b/drivers/media/platform/m2m-deinterlace.c index beb7fd7442fb..e9310eeec6cf 100644 --- a/drivers/media/platform/m2m-deinterlace.c +++ b/drivers/media/platform/m2m-deinterlace.c @@ -37,7 +37,6 @@ module_param(debug, bool, 0644); v4l2_dbg(1, debug, &dev->v4l2_dev, "%s: " fmt, __func__, ## arg) struct deinterlace_fmt { - char *name; u32 fourcc; /* Types the format can be used for */ u32 types; @@ -45,12 +44,10 @@ struct deinterlace_fmt { static struct deinterlace_fmt formats[] = { { - .name = "YUV 4:2:0 Planar", .fourcc = V4L2_PIX_FMT_YUV420, .types = MEM2MEM_CAPTURE | MEM2MEM_OUTPUT, }, { - .name = "YUYV 4:2:2", .fourcc = V4L2_PIX_FMT_YUYV, .types = MEM2MEM_CAPTURE | MEM2MEM_OUTPUT, }, @@ -470,7 +467,6 @@ static int enum_fmt(struct v4l2_fmtdesc *f, u32 type) if (i < NUM_FORMATS) { /* Format found */ fmt = &formats[i]; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; return 0; } diff --git a/drivers/media/platform/marvell-ccic/mcam-core.c b/drivers/media/platform/marvell-ccic/mcam-core.c index dc30c48d4671..30ac454e702e 100644 --- a/drivers/media/platform/marvell-ccic/mcam-core.c +++ b/drivers/media/platform/marvell-ccic/mcam-core.c @@ -98,56 +98,48 @@ MODULE_PARM_DESC(buffer_mode, container_of(notifier, struct mcam_camera, notifier) static struct mcam_format_struct { - __u8 *desc; __u32 pixelformat; int bpp; /* Bytes per pixel */ bool planar; u32 mbus_code; } mcam_formats[] = { { - .desc = "YUYV 4:2:2", .pixelformat = V4L2_PIX_FMT_YUYV, .mbus_code = MEDIA_BUS_FMT_YUYV8_2X8, .bpp = 2, .planar = false, }, { - .desc = "YVYU 4:2:2", .pixelformat = V4L2_PIX_FMT_YVYU, .mbus_code = MEDIA_BUS_FMT_YUYV8_2X8, .bpp = 2, .planar = false, }, { - .desc = "YUV 4:2:0 PLANAR", .pixelformat = V4L2_PIX_FMT_YUV420, .mbus_code = MEDIA_BUS_FMT_YUYV8_2X8, .bpp = 1, .planar = true, }, { - .desc = "YVU 4:2:0 PLANAR", .pixelformat = V4L2_PIX_FMT_YVU420, .mbus_code = MEDIA_BUS_FMT_YUYV8_2X8, .bpp = 1, .planar = true, }, { - .desc = "XRGB 444", .pixelformat = V4L2_PIX_FMT_XRGB444, .mbus_code = MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE, .bpp = 2, .planar = false, }, { - .desc = "RGB 565", .pixelformat = V4L2_PIX_FMT_RGB565, .mbus_code = MEDIA_BUS_FMT_RGB565_2X8_LE, .bpp = 2, .planar = false, }, { - .desc = "Raw RGB Bayer", .pixelformat = V4L2_PIX_FMT_SBGGR8, .mbus_code = MEDIA_BUS_FMT_SBGGR8_1X8, .bpp = 1, @@ -1369,8 +1361,6 @@ static int mcam_vidioc_enum_fmt_vid_cap(struct file *filp, { if (fmt->index >= N_MCAM_FMTS) return -EINVAL; - strscpy(fmt->description, mcam_formats[fmt->index].desc, - sizeof(fmt->description)); fmt->pixelformat = mcam_formats[fmt->index].pixelformat; return 0; } diff --git a/drivers/media/platform/mx2_emmaprp.c b/drivers/media/platform/mx2_emmaprp.c index 333324c75027..4d4225ab1589 100644 --- a/drivers/media/platform/mx2_emmaprp.c +++ b/drivers/media/platform/mx2_emmaprp.c @@ -145,7 +145,6 @@ module_param(debug, bool, 0644); #define PRP_INTR_ST_CH2OVF (1 << 8) struct emmaprp_fmt { - char *name; u32 fourcc; /* Types the format can be used for */ u32 types; @@ -153,12 +152,10 @@ struct emmaprp_fmt { static struct emmaprp_fmt formats[] = { { - .name = "YUV 4:2:0 Planar", .fourcc = V4L2_PIX_FMT_YUV420, .types = MEM2MEM_CAPTURE, }, { - .name = "4:2:2, packed, YUYV", .fourcc = V4L2_PIX_FMT_YUYV, .types = MEM2MEM_OUTPUT, }, @@ -409,7 +406,6 @@ static int enum_fmt(struct v4l2_fmtdesc *f, u32 type) if (i < NUM_FORMATS) { /* Format found */ fmt = &formats[i]; - strscpy(f->description, fmt->name, sizeof(f->description) - 1); f->pixelformat = fmt->fourcc; return 0; } diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c index cb6a9e3946b6..1f6742536c46 100644 --- a/drivers/media/platform/omap/omap_vout.c +++ b/drivers/media/platform/omap/omap_vout.c @@ -114,14 +114,12 @@ static const struct v4l2_fmtdesc omap_formats[] = { * Byte 0 Byte 1 * g2 g1 g0 b4 b3 b2 b1 b0 r4 r3 r2 r1 r0 g5 g4 g3 */ - .description = "RGB565, le", .pixelformat = V4L2_PIX_FMT_RGB565, }, { /* Note: V4L2 defines RGB32 as: RGB-8-8-8-8 we use * this for RGB24 unpack mode, the last 8 bits are ignored * */ - .description = "RGB32, le", .pixelformat = V4L2_PIX_FMT_RGB32, }, { @@ -129,15 +127,12 @@ static const struct v4l2_fmtdesc omap_formats[] = { * this for RGB24 packed mode * */ - .description = "RGB24, le", .pixelformat = V4L2_PIX_FMT_RGB24, }, { - .description = "YUYV (YUV 4:2:2), packed", .pixelformat = V4L2_PIX_FMT_YUYV, }, { - .description = "UYVY, packed", .pixelformat = V4L2_PIX_FMT_UYVY, }, }; @@ -1060,8 +1055,6 @@ static int vidioc_enum_fmt_vid_out(struct file *file, void *fh, return -EINVAL; fmt->flags = omap_formats[index].flags; - strscpy(fmt->description, omap_formats[index].description, - sizeof(fmt->description)); fmt->pixelformat = omap_formats[index].pixelformat; return 0; diff --git a/drivers/media/platform/s3c-camif/camif-capture.c b/drivers/media/platform/s3c-camif/camif-capture.c index a876d0873ebc..2191fdded9da 100644 --- a/drivers/media/platform/s3c-camif/camif-capture.c +++ b/drivers/media/platform/s3c-camif/camif-capture.c @@ -685,10 +685,7 @@ static int s3c_camif_vidioc_enum_fmt(struct file *file, void *priv, if (!fmt) return -EINVAL; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; - - pr_debug("fmt(%d): %s\n", f->index, f->description); return 0; } @@ -802,10 +799,10 @@ static int s3c_camif_vidioc_s_fmt(struct file *file, void *priv, if (vp->owner == NULL) vp->owner = priv; - pr_debug("%ux%u. payload: %u. fmt: %s. %d %d. sizeimage: %d. bpl: %d\n", - out_frame->f_width, out_frame->f_height, vp->payload, fmt->name, - pix->width * pix->height * fmt->depth, fmt->depth, - pix->sizeimage, pix->bytesperline); + pr_debug("%ux%u. payload: %u. fmt: 0x%08x. %d %d. sizeimage: %d. bpl: %d\n", + out_frame->f_width, out_frame->f_height, vp->payload, + fmt->fourcc, pix->width * pix->height * fmt->depth, + fmt->depth, pix->sizeimage, pix->bytesperline); return 0; } diff --git a/drivers/media/platform/s3c-camif/camif-core.c b/drivers/media/platform/s3c-camif/camif-core.c index b05ce0149ca1..f0acd6edcbba 100644 --- a/drivers/media/platform/s3c-camif/camif-core.c +++ b/drivers/media/platform/s3c-camif/camif-core.c @@ -42,7 +42,6 @@ static char *camif_clocks[CLK_MAX_NUM] = { static const struct camif_fmt camif_formats[] = { { - .name = "YUV 4:2:2 planar, Y/Cb/Cr", .fourcc = V4L2_PIX_FMT_YUV422P, .depth = 16, .ybpp = 1, @@ -51,7 +50,6 @@ static const struct camif_fmt camif_formats[] = { .flags = FMT_FL_S3C24XX_CODEC | FMT_FL_S3C64XX, }, { - .name = "YUV 4:2:0 planar, Y/Cb/Cr", .fourcc = V4L2_PIX_FMT_YUV420, .depth = 12, .ybpp = 1, @@ -60,7 +58,6 @@ static const struct camif_fmt camif_formats[] = { .flags = FMT_FL_S3C24XX_CODEC | FMT_FL_S3C64XX, }, { - .name = "YVU 4:2:0 planar, Y/Cr/Cb", .fourcc = V4L2_PIX_FMT_YVU420, .depth = 12, .ybpp = 1, @@ -69,7 +66,6 @@ static const struct camif_fmt camif_formats[] = { .flags = FMT_FL_S3C24XX_CODEC | FMT_FL_S3C64XX, }, { - .name = "RGB565, 16 bpp", .fourcc = V4L2_PIX_FMT_RGB565X, .depth = 16, .ybpp = 2, @@ -78,7 +74,6 @@ static const struct camif_fmt camif_formats[] = { .flags = FMT_FL_S3C24XX_PREVIEW | FMT_FL_S3C64XX, }, { - .name = "XRGB8888, 32 bpp", .fourcc = V4L2_PIX_FMT_RGB32, .depth = 32, .ybpp = 4, @@ -87,7 +82,6 @@ static const struct camif_fmt camif_formats[] = { .flags = FMT_FL_S3C24XX_PREVIEW | FMT_FL_S3C64XX, }, { - .name = "BGR666", .fourcc = V4L2_PIX_FMT_BGR666, .depth = 32, .ybpp = 4, diff --git a/drivers/media/platform/s3c-camif/camif-core.h b/drivers/media/platform/s3c-camif/camif-core.h index efdc00b4ec6f..f937e638490f 100644 --- a/drivers/media/platform/s3c-camif/camif-core.h +++ b/drivers/media/platform/s3c-camif/camif-core.h @@ -89,7 +89,6 @@ enum img_fmt { * @ybpp: number of luminance bytes per pixel */ struct camif_fmt { - char *name; u32 fourcc; u32 color; u16 colplanes; diff --git a/drivers/media/platform/s5p-g2d/g2d.c b/drivers/media/platform/s5p-g2d/g2d.c index 152d192d5c3f..b3fc0ff8a5ec 100644 --- a/drivers/media/platform/s5p-g2d/g2d.c +++ b/drivers/media/platform/s5p-g2d/g2d.c @@ -29,31 +29,26 @@ static struct g2d_fmt formats[] = { { - .name = "XRGB_8888", .fourcc = V4L2_PIX_FMT_RGB32, .depth = 32, .hw = COLOR_MODE(ORDER_XRGB, MODE_XRGB_8888), }, { - .name = "RGB_565", .fourcc = V4L2_PIX_FMT_RGB565X, .depth = 16, .hw = COLOR_MODE(ORDER_XRGB, MODE_RGB_565), }, { - .name = "XRGB_1555", .fourcc = V4L2_PIX_FMT_RGB555X, .depth = 16, .hw = COLOR_MODE(ORDER_XRGB, MODE_XRGB_1555), }, { - .name = "XRGB_4444", .fourcc = V4L2_PIX_FMT_RGB444, .depth = 16, .hw = COLOR_MODE(ORDER_XRGB, MODE_XRGB_4444), }, { - .name = "PACKED_RGB_888", .fourcc = V4L2_PIX_FMT_RGB24, .depth = 24, .hw = COLOR_MODE(ORDER_XRGB, MODE_PACKED_RGB_888), @@ -303,12 +298,9 @@ static int vidioc_querycap(struct file *file, void *priv, static int vidioc_enum_fmt(struct file *file, void *prv, struct v4l2_fmtdesc *f) { - struct g2d_fmt *fmt; if (f->index >= NUM_FORMATS) return -EINVAL; - fmt = &formats[f->index]; - f->pixelformat = fmt->fourcc; - strscpy(f->description, fmt->name, sizeof(f->description)); + f->pixelformat = formats[f->index].fourcc; return 0; } diff --git a/drivers/media/platform/s5p-g2d/g2d.h b/drivers/media/platform/s5p-g2d/g2d.h index def0ec0dabeb..c2309c1370da 100644 --- a/drivers/media/platform/s5p-g2d/g2d.h +++ b/drivers/media/platform/s5p-g2d/g2d.h @@ -61,7 +61,6 @@ struct g2d_ctx { }; struct g2d_fmt { - char *name; u32 fourcc; int depth; u32 hw; diff --git a/drivers/media/platform/s5p-jpeg/jpeg-core.c b/drivers/media/platform/s5p-jpeg/jpeg-core.c index a3bc884b7df1..20cad061e908 100644 --- a/drivers/media/platform/s5p-jpeg/jpeg-core.c +++ b/drivers/media/platform/s5p-jpeg/jpeg-core.c @@ -35,7 +35,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { { - .name = "JPEG JFIF", .fourcc = V4L2_PIX_FMT_JPEG, .flags = SJPEG_FMT_FLAG_ENC_CAPTURE | SJPEG_FMT_FLAG_DEC_OUTPUT | @@ -44,7 +43,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { SJPEG_FMT_FLAG_EXYNOS4, }, { - .name = "YUV 4:2:2 packed, YCbYCr", .fourcc = V4L2_PIX_FMT_YUYV, .depth = 16, .colplanes = 1, @@ -57,7 +55,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "YUV 4:2:2 packed, YCbYCr", .fourcc = V4L2_PIX_FMT_YUYV, .depth = 16, .colplanes = 1, @@ -70,7 +67,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "YUV 4:2:2 packed, YCbYCr", .fourcc = V4L2_PIX_FMT_YUYV, .depth = 16, .colplanes = 1, @@ -83,7 +79,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "YUV 4:2:2 packed, YCrYCb", .fourcc = V4L2_PIX_FMT_YVYU, .depth = 16, .colplanes = 1, @@ -96,7 +91,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "YUV 4:2:2 packed, YCrYCb", .fourcc = V4L2_PIX_FMT_YVYU, .depth = 16, .colplanes = 1, @@ -109,7 +103,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "YUV 4:2:2 packed, YCrYCb", .fourcc = V4L2_PIX_FMT_UYVY, .depth = 16, .colplanes = 1, @@ -122,7 +115,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "YUV 4:2:2 packed, YCrYCb", .fourcc = V4L2_PIX_FMT_VYUY, .depth = 16, .colplanes = 1, @@ -135,7 +127,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "RGB565", .fourcc = V4L2_PIX_FMT_RGB565, .depth = 16, .colplanes = 1, @@ -148,7 +139,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_444, }, { - .name = "RGB565", .fourcc = V4L2_PIX_FMT_RGB565, .depth = 16, .colplanes = 1, @@ -161,7 +151,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_444, }, { - .name = "RGB565X", .fourcc = V4L2_PIX_FMT_RGB565X, .depth = 16, .colplanes = 1, @@ -174,7 +163,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_444, }, { - .name = "RGB565", .fourcc = V4L2_PIX_FMT_RGB565, .depth = 16, .colplanes = 1, @@ -186,7 +174,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_444, }, { - .name = "ARGB8888, 32 bpp", .fourcc = V4L2_PIX_FMT_RGB32, .depth = 32, .colplanes = 1, @@ -199,7 +186,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_444, }, { - .name = "ARGB8888, 32 bpp", .fourcc = V4L2_PIX_FMT_RGB32, .depth = 32, .colplanes = 1, @@ -212,7 +198,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_444, }, { - .name = "YUV 4:4:4 planar, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV24, .depth = 24, .colplanes = 2, @@ -225,7 +210,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_444, }, { - .name = "YUV 4:4:4 planar, Y/CrCb", .fourcc = V4L2_PIX_FMT_NV42, .depth = 24, .colplanes = 2, @@ -238,7 +222,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_444, }, { - .name = "YUV 4:2:2 planar, Y/CrCb", .fourcc = V4L2_PIX_FMT_NV61, .depth = 16, .colplanes = 2, @@ -251,7 +234,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "YUV 4:2:2 planar, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV16, .depth = 16, .colplanes = 2, @@ -264,7 +246,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_422, }, { - .name = "YUV 4:2:0 planar, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV12, .depth = 12, .colplanes = 2, @@ -277,7 +258,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_420, }, { - .name = "YUV 4:2:0 planar, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV12, .depth = 12, .colplanes = 2, @@ -290,7 +270,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_420, }, { - .name = "YUV 4:2:0 planar, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV12, .depth = 12, .colplanes = 2, @@ -303,7 +282,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_420, }, { - .name = "YUV 4:2:0 planar, Y/CrCb", .fourcc = V4L2_PIX_FMT_NV21, .depth = 12, .colplanes = 2, @@ -316,7 +294,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_420, }, { - .name = "YUV 4:2:0 planar, Y/CrCb", .fourcc = V4L2_PIX_FMT_NV21, .depth = 12, .colplanes = 2, @@ -330,7 +307,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_420, }, { - .name = "YUV 4:2:0 contiguous 3-planar, Y/Cb/Cr", .fourcc = V4L2_PIX_FMT_YUV420, .depth = 12, .colplanes = 3, @@ -343,7 +319,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_420, }, { - .name = "YUV 4:2:0 contiguous 3-planar, Y/Cb/Cr", .fourcc = V4L2_PIX_FMT_YUV420, .depth = 12, .colplanes = 3, @@ -356,7 +331,6 @@ static struct s5p_jpeg_fmt sjpeg_formats[] = { .subsampling = V4L2_JPEG_CHROMA_SUBSAMPLING_420, }, { - .name = "Gray", .fourcc = V4L2_PIX_FMT_GREY, .depth = 8, .colplanes = 1, @@ -1314,7 +1288,6 @@ static int enum_fmt(struct s5p_jpeg_ctx *ctx, if (i >= n) return -EINVAL; - strscpy(f->description, sjpeg_formats[i].name, sizeof(f->description)); f->pixelformat = sjpeg_formats[i].fourcc; return 0; diff --git a/drivers/media/platform/s5p-jpeg/jpeg-core.h b/drivers/media/platform/s5p-jpeg/jpeg-core.h index 34f87f6c02f2..3bc52f83f5bc 100644 --- a/drivers/media/platform/s5p-jpeg/jpeg-core.h +++ b/drivers/media/platform/s5p-jpeg/jpeg-core.h @@ -150,7 +150,6 @@ struct s5p_jpeg_variant { /** * struct jpeg_fmt - driver's internal color format data - * @name: format description * @fourcc: the fourcc code, 0 if not applicable * @depth: number of bits per pixel * @colplanes: number of color planes (1 for packed formats) @@ -159,7 +158,6 @@ struct s5p_jpeg_variant { * @flags: flags describing format applicability */ struct s5p_jpeg_fmt { - char *name; u32 fourcc; int depth; int colplanes; diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_common.h b/drivers/media/platform/s5p-mfc/s5p_mfc_common.h index 5dc086516360..96d1ecd1521b 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc_common.h +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_common.h @@ -718,7 +718,6 @@ struct s5p_mfc_ctx { * used by the MFC */ struct s5p_mfc_fmt { - char *name; u32 fourcc; u32 codec_mode; enum s5p_mfc_fmt_type type; diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_dec.c b/drivers/media/platform/s5p-mfc/s5p_mfc_dec.c index 4017c8b471f4..61e144a35201 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc_dec.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_dec.c @@ -29,7 +29,6 @@ static struct s5p_mfc_fmt formats[] = { { - .name = "4:2:0 2 Planes 16x16 Tiles", .fourcc = V4L2_PIX_FMT_NV12MT_16X16, .codec_mode = S5P_MFC_CODEC_NONE, .type = MFC_FMT_RAW, @@ -37,7 +36,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V6_BIT | MFC_V7_BIT, }, { - .name = "4:2:0 2 Planes 64x32 Tiles", .fourcc = V4L2_PIX_FMT_NV12MT, .codec_mode = S5P_MFC_CODEC_NONE, .type = MFC_FMT_RAW, @@ -45,7 +43,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5_BIT, }, { - .name = "4:2:0 2 Planes Y/CbCr", .fourcc = V4L2_PIX_FMT_NV12M, .codec_mode = S5P_MFC_CODEC_NONE, .type = MFC_FMT_RAW, @@ -53,7 +50,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V6PLUS_BITS, }, { - .name = "4:2:0 2 Planes Y/CrCb", .fourcc = V4L2_PIX_FMT_NV21M, .codec_mode = S5P_MFC_CODEC_NONE, .type = MFC_FMT_RAW, @@ -61,7 +57,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V6PLUS_BITS, }, { - .name = "H264 Encoded Stream", .fourcc = V4L2_PIX_FMT_H264, .codec_mode = S5P_MFC_CODEC_H264_DEC, .type = MFC_FMT_DEC, @@ -69,7 +64,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "H264/MVC Encoded Stream", .fourcc = V4L2_PIX_FMT_H264_MVC, .codec_mode = S5P_MFC_CODEC_H264_MVC_DEC, .type = MFC_FMT_DEC, @@ -77,7 +71,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V6PLUS_BITS, }, { - .name = "H263 Encoded Stream", .fourcc = V4L2_PIX_FMT_H263, .codec_mode = S5P_MFC_CODEC_H263_DEC, .type = MFC_FMT_DEC, @@ -85,7 +78,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "MPEG1 Encoded Stream", .fourcc = V4L2_PIX_FMT_MPEG1, .codec_mode = S5P_MFC_CODEC_MPEG2_DEC, .type = MFC_FMT_DEC, @@ -93,7 +85,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "MPEG2 Encoded Stream", .fourcc = V4L2_PIX_FMT_MPEG2, .codec_mode = S5P_MFC_CODEC_MPEG2_DEC, .type = MFC_FMT_DEC, @@ -101,7 +92,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "MPEG4 Encoded Stream", .fourcc = V4L2_PIX_FMT_MPEG4, .codec_mode = S5P_MFC_CODEC_MPEG4_DEC, .type = MFC_FMT_DEC, @@ -109,7 +99,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "XviD Encoded Stream", .fourcc = V4L2_PIX_FMT_XVID, .codec_mode = S5P_MFC_CODEC_MPEG4_DEC, .type = MFC_FMT_DEC, @@ -117,7 +106,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "VC1 Encoded Stream", .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, .codec_mode = S5P_MFC_CODEC_VC1_DEC, .type = MFC_FMT_DEC, @@ -125,7 +113,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "VC1 RCV Encoded Stream", .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, .codec_mode = S5P_MFC_CODEC_VC1RCV_DEC, .type = MFC_FMT_DEC, @@ -133,7 +120,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "VP8 Encoded Stream", .fourcc = V4L2_PIX_FMT_VP8, .codec_mode = S5P_MFC_CODEC_VP8_DEC, .type = MFC_FMT_DEC, @@ -279,7 +265,6 @@ static int vidioc_enum_fmt(struct file *file, struct v4l2_fmtdesc *f, bool out) { struct s5p_mfc_dev *dev = video_drvdata(file); - struct s5p_mfc_fmt *fmt; int i, j = 0; for (i = 0; i < ARRAY_SIZE(formats); ++i) { @@ -296,9 +281,7 @@ static int vidioc_enum_fmt(struct file *file, struct v4l2_fmtdesc *f, } if (i == ARRAY_SIZE(formats)) return -EINVAL; - fmt = &formats[i]; - strscpy(f->description, fmt->name, sizeof(f->description)); - f->pixelformat = fmt->fourcc; + f->pixelformat = formats[i].fourcc; return 0; } diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c b/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c index 97e76480e942..912fe0c5ab18 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c @@ -32,7 +32,6 @@ static struct s5p_mfc_fmt formats[] = { { - .name = "4:2:0 2 Planes 16x16 Tiles", .fourcc = V4L2_PIX_FMT_NV12MT_16X16, .codec_mode = S5P_MFC_CODEC_NONE, .type = MFC_FMT_RAW, @@ -40,7 +39,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V6_BIT | MFC_V7_BIT, }, { - .name = "4:2:0 2 Planes 64x32 Tiles", .fourcc = V4L2_PIX_FMT_NV12MT, .codec_mode = S5P_MFC_CODEC_NONE, .type = MFC_FMT_RAW, @@ -48,7 +46,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5_BIT, }, { - .name = "4:2:0 2 Planes Y/CbCr", .fourcc = V4L2_PIX_FMT_NV12M, .codec_mode = S5P_MFC_CODEC_NONE, .type = MFC_FMT_RAW, @@ -56,7 +53,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "4:2:0 2 Planes Y/CrCb", .fourcc = V4L2_PIX_FMT_NV21M, .codec_mode = S5P_MFC_CODEC_NONE, .type = MFC_FMT_RAW, @@ -64,7 +60,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V6PLUS_BITS, }, { - .name = "H264 Encoded Stream", .fourcc = V4L2_PIX_FMT_H264, .codec_mode = S5P_MFC_CODEC_H264_ENC, .type = MFC_FMT_ENC, @@ -72,7 +67,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "MPEG4 Encoded Stream", .fourcc = V4L2_PIX_FMT_MPEG4, .codec_mode = S5P_MFC_CODEC_MPEG4_ENC, .type = MFC_FMT_ENC, @@ -80,7 +74,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "H263 Encoded Stream", .fourcc = V4L2_PIX_FMT_H263, .codec_mode = S5P_MFC_CODEC_H263_ENC, .type = MFC_FMT_ENC, @@ -88,7 +81,6 @@ static struct s5p_mfc_fmt formats[] = { .versions = MFC_V5PLUS_BITS, }, { - .name = "VP8 Encoded Stream", .fourcc = V4L2_PIX_FMT_VP8, .codec_mode = S5P_MFC_CODEC_VP8_ENC, .type = MFC_FMT_ENC, @@ -1320,7 +1312,6 @@ static int vidioc_enum_fmt(struct file *file, struct v4l2_fmtdesc *f, bool out) { struct s5p_mfc_dev *dev = video_drvdata(file); - struct s5p_mfc_fmt *fmt; int i, j = 0; for (i = 0; i < ARRAY_SIZE(formats); ++i) { @@ -1332,10 +1323,7 @@ static int vidioc_enum_fmt(struct file *file, struct v4l2_fmtdesc *f, continue; if (j == f->index) { - fmt = &formats[i]; - strscpy(f->description, fmt->name, - sizeof(f->description)); - f->pixelformat = fmt->fourcc; + f->pixelformat = formats[i].fourcc; return 0; } ++j; diff --git a/drivers/media/platform/sh_veu.c b/drivers/media/platform/sh_veu.c index 5a9ba05c996e..26360af2325e 100644 --- a/drivers/media/platform/sh_veu.c +++ b/drivers/media/platform/sh_veu.c @@ -86,7 +86,6 @@ struct sh_veu_file { }; struct sh_veu_format { - char *name; u32 fourcc; unsigned int depth; unsigned int ydepth; @@ -144,14 +143,14 @@ enum sh_veu_fmt_idx { * aligned for NV24. */ static const struct sh_veu_format sh_veu_fmt[] = { - [SH_VEU_FMT_NV12] = { .ydepth = 8, .depth = 12, .name = "NV12", .fourcc = V4L2_PIX_FMT_NV12 }, - [SH_VEU_FMT_NV16] = { .ydepth = 8, .depth = 16, .name = "NV16", .fourcc = V4L2_PIX_FMT_NV16 }, - [SH_VEU_FMT_NV24] = { .ydepth = 8, .depth = 24, .name = "NV24", .fourcc = V4L2_PIX_FMT_NV24 }, - [SH_VEU_FMT_RGB332] = { .ydepth = 8, .depth = 8, .name = "RGB332", .fourcc = V4L2_PIX_FMT_RGB332 }, - [SH_VEU_FMT_RGB444] = { .ydepth = 16, .depth = 16, .name = "RGB444", .fourcc = V4L2_PIX_FMT_RGB444 }, - [SH_VEU_FMT_RGB565] = { .ydepth = 16, .depth = 16, .name = "RGB565", .fourcc = V4L2_PIX_FMT_RGB565 }, - [SH_VEU_FMT_RGB666] = { .ydepth = 32, .depth = 32, .name = "BGR666", .fourcc = V4L2_PIX_FMT_BGR666 }, - [SH_VEU_FMT_RGB24] = { .ydepth = 24, .depth = 24, .name = "RGB24", .fourcc = V4L2_PIX_FMT_RGB24 }, + [SH_VEU_FMT_NV12] = { .ydepth = 8, .depth = 12, .fourcc = V4L2_PIX_FMT_NV12 }, + [SH_VEU_FMT_NV16] = { .ydepth = 8, .depth = 16, .fourcc = V4L2_PIX_FMT_NV16 }, + [SH_VEU_FMT_NV24] = { .ydepth = 8, .depth = 24, .fourcc = V4L2_PIX_FMT_NV24 }, + [SH_VEU_FMT_RGB332] = { .ydepth = 8, .depth = 8, .fourcc = V4L2_PIX_FMT_RGB332 }, + [SH_VEU_FMT_RGB444] = { .ydepth = 16, .depth = 16, .fourcc = V4L2_PIX_FMT_RGB444 }, + [SH_VEU_FMT_RGB565] = { .ydepth = 16, .depth = 16, .fourcc = V4L2_PIX_FMT_RGB565 }, + [SH_VEU_FMT_RGB666] = { .ydepth = 32, .depth = 32, .fourcc = V4L2_PIX_FMT_BGR666 }, + [SH_VEU_FMT_RGB24] = { .ydepth = 24, .depth = 24, .fourcc = V4L2_PIX_FMT_RGB24 }, }; #define DEFAULT_IN_VFMT (struct sh_veu_vfmt){ \ @@ -359,8 +358,6 @@ static int sh_veu_enum_fmt(struct v4l2_fmtdesc *f, const int *fmt, int fmt_num) if (f->index >= fmt_num) return -EINVAL; - strscpy(f->description, sh_veu_fmt[fmt[f->index]].name, - sizeof(f->description)); f->pixelformat = sh_veu_fmt[fmt[f->index]].fourcc; return 0; } diff --git a/drivers/media/platform/sh_vou.c b/drivers/media/platform/sh_vou.c index 5799aa4b9323..4fc1b4e11b70 100644 --- a/drivers/media/platform/sh_vou.c +++ b/drivers/media/platform/sh_vou.c @@ -138,7 +138,6 @@ static void sh_vou_reg_ab_set(struct sh_vou_device *vou_dev, unsigned int reg, struct sh_vou_fmt { u32 pfmt; - char *desc; unsigned char bpp; unsigned char bpl; unsigned char rgb; @@ -152,7 +151,6 @@ static struct sh_vou_fmt vou_fmt[] = { .pfmt = V4L2_PIX_FMT_NV12, .bpp = 12, .bpl = 1, - .desc = "YVU420 planar", .yf = 0, .rgb = 0, }, @@ -160,7 +158,6 @@ static struct sh_vou_fmt vou_fmt[] = { .pfmt = V4L2_PIX_FMT_NV16, .bpp = 16, .bpl = 1, - .desc = "YVYU planar", .yf = 1, .rgb = 0, }, @@ -168,7 +165,6 @@ static struct sh_vou_fmt vou_fmt[] = { .pfmt = V4L2_PIX_FMT_RGB24, .bpp = 24, .bpl = 3, - .desc = "RGB24", .pkf = 2, .rgb = 1, }, @@ -176,7 +172,6 @@ static struct sh_vou_fmt vou_fmt[] = { .pfmt = V4L2_PIX_FMT_RGB565, .bpp = 16, .bpl = 2, - .desc = "RGB565", .pkf = 3, .rgb = 1, }, @@ -184,7 +179,6 @@ static struct sh_vou_fmt vou_fmt[] = { .pfmt = V4L2_PIX_FMT_RGB565X, .bpp = 16, .bpl = 2, - .desc = "RGB565 byteswapped", .pkf = 3, .rgb = 1, }, @@ -398,9 +392,6 @@ static int sh_vou_enum_fmt_vid_out(struct file *file, void *priv, dev_dbg(vou_dev->v4l2_dev.dev, "%s()\n", __func__); - fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; - strscpy(fmt->description, vou_fmt[fmt->index].desc, - sizeof(fmt->description)); fmt->pixelformat = vou_fmt[fmt->index].pfmt; return 0; @@ -494,7 +485,8 @@ static void sh_vou_configure_geometry(struct sh_vou_device *vou_dev, if (h_idx) vouvcr |= (1 << 14) | vou_scale_v_fld[h_idx - 1]; - dev_dbg(vou_dev->v4l2_dev.dev, "%s: scaling 0x%x\n", fmt->desc, vouvcr); + dev_dbg(vou_dev->v4l2_dev.dev, "0x%08x: scaling 0x%x\n", + fmt->pfmt, vouvcr); /* To produce a colour bar for testing set bit 23 of VOUVCR */ sh_vou_reg_ab_write(vou_dev, VOUVCR, vouvcr); diff --git a/drivers/media/platform/ti-vpe/vpe.c b/drivers/media/platform/ti-vpe/vpe.c index dda04498ac56..60b575bb44c4 100644 --- a/drivers/media/platform/ti-vpe/vpe.c +++ b/drivers/media/platform/ti-vpe/vpe.c @@ -224,7 +224,6 @@ static const struct vpe_port_data port_data[11] = { /* driver info for each of the supported video formats */ struct vpe_fmt { - char *name; /* human-readable name */ u32 fourcc; /* standard format identifier */ u8 types; /* CAPTURE and/or OUTPUT */ u8 coplanar; /* set for unpacked Luma and Chroma */ @@ -234,7 +233,6 @@ struct vpe_fmt { static struct vpe_fmt vpe_formats[] = { { - .name = "NV16 YUV 422 co-planar", .fourcc = V4L2_PIX_FMT_NV16, .types = VPE_FMT_TYPE_CAPTURE | VPE_FMT_TYPE_OUTPUT, .coplanar = 1, @@ -243,7 +241,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "NV12 YUV 420 co-planar", .fourcc = V4L2_PIX_FMT_NV12, .types = VPE_FMT_TYPE_CAPTURE | VPE_FMT_TYPE_OUTPUT, .coplanar = 1, @@ -252,7 +249,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "YUYV 422 packed", .fourcc = V4L2_PIX_FMT_YUYV, .types = VPE_FMT_TYPE_CAPTURE | VPE_FMT_TYPE_OUTPUT, .coplanar = 0, @@ -260,7 +256,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "UYVY 422 packed", .fourcc = V4L2_PIX_FMT_UYVY, .types = VPE_FMT_TYPE_CAPTURE | VPE_FMT_TYPE_OUTPUT, .coplanar = 0, @@ -268,7 +263,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "RGB888 packed", .fourcc = V4L2_PIX_FMT_RGB24, .types = VPE_FMT_TYPE_CAPTURE, .coplanar = 0, @@ -276,7 +270,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "ARGB32", .fourcc = V4L2_PIX_FMT_RGB32, .types = VPE_FMT_TYPE_CAPTURE, .coplanar = 0, @@ -284,7 +277,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "BGR888 packed", .fourcc = V4L2_PIX_FMT_BGR24, .types = VPE_FMT_TYPE_CAPTURE, .coplanar = 0, @@ -292,7 +284,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "ABGR32", .fourcc = V4L2_PIX_FMT_BGR32, .types = VPE_FMT_TYPE_CAPTURE, .coplanar = 0, @@ -300,7 +291,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "RGB565", .fourcc = V4L2_PIX_FMT_RGB565, .types = VPE_FMT_TYPE_CAPTURE, .coplanar = 0, @@ -308,7 +298,6 @@ static struct vpe_fmt vpe_formats[] = { }, }, { - .name = "RGB5551", .fourcc = V4L2_PIX_FMT_RGB555, .types = VPE_FMT_TYPE_CAPTURE, .coplanar = 0, @@ -1514,7 +1503,6 @@ static int __enum_fmt(struct v4l2_fmtdesc *f, u32 type) if (!fmt) return -EINVAL; - strscpy(f->description, fmt->name, sizeof(f->description)); f->pixelformat = fmt->fourcc; return 0; } diff --git a/drivers/media/platform/via-camera.c b/drivers/media/platform/via-camera.c index 038de7a2027a..d5f811820be9 100644 --- a/drivers/media/platform/via-camera.c +++ b/drivers/media/platform/via-camera.c @@ -142,13 +142,11 @@ static struct via_camera *via_cam_info; * now this information must be managed at this level too. */ static struct via_format { - __u8 *desc; __u32 pixelformat; int bpp; /* Bytes per pixel */ u32 mbus_code; } via_formats[] = { { - .desc = "YUYV 4:2:2", .pixelformat = V4L2_PIX_FMT_YUYV, .mbus_code = MEDIA_BUS_FMT_YUYV8_2X8, .bpp = 2, @@ -860,8 +858,6 @@ static int viacam_enum_fmt_vid_cap(struct file *filp, void *priv, { if (fmt->index >= N_VIA_FMTS) return -EINVAL; - strscpy(fmt->description, via_formats[fmt->index].desc, - sizeof(fmt->description)); fmt->pixelformat = via_formats[fmt->index].pixelformat; return 0; } diff --git a/drivers/media/platform/xilinx/xilinx-dma.c b/drivers/media/platform/xilinx/xilinx-dma.c index c9d5fdb2d407..7bd2600cdc9a 100644 --- a/drivers/media/platform/xilinx/xilinx-dma.c +++ b/drivers/media/platform/xilinx/xilinx-dma.c @@ -524,8 +524,6 @@ xvip_dma_enum_format(struct file *file, void *fh, struct v4l2_fmtdesc *f) return -EINVAL; f->pixelformat = dma->format.pixelformat; - strscpy(f->description, dma->fmtinfo->description, - sizeof(f->description)); return 0; } diff --git a/drivers/media/platform/xilinx/xilinx-vip.c b/drivers/media/platform/xilinx/xilinx-vip.c index 08a825c3a3f6..6ad61b08a31a 100644 --- a/drivers/media/platform/xilinx/xilinx-vip.c +++ b/drivers/media/platform/xilinx/xilinx-vip.c @@ -25,21 +25,21 @@ static const struct xvip_video_format xvip_video_formats[] = { { XVIP_VF_YUV_422, 8, NULL, MEDIA_BUS_FMT_UYVY8_1X16, - 2, V4L2_PIX_FMT_YUYV, "4:2:2, packed, YUYV" }, + 2, V4L2_PIX_FMT_YUYV }, { XVIP_VF_YUV_444, 8, NULL, MEDIA_BUS_FMT_VUY8_1X24, - 3, V4L2_PIX_FMT_YUV444, "4:4:4, packed, YUYV" }, + 3, V4L2_PIX_FMT_YUV444 }, { XVIP_VF_RBG, 8, NULL, MEDIA_BUS_FMT_RBG888_1X24, - 3, 0, NULL }, + 3, 0 }, { XVIP_VF_MONO_SENSOR, 8, "mono", MEDIA_BUS_FMT_Y8_1X8, - 1, V4L2_PIX_FMT_GREY, "Greyscale 8-bit" }, + 1, V4L2_PIX_FMT_GREY }, { XVIP_VF_MONO_SENSOR, 8, "rggb", MEDIA_BUS_FMT_SRGGB8_1X8, - 1, V4L2_PIX_FMT_SRGGB8, "Bayer 8-bit RGGB" }, + 1, V4L2_PIX_FMT_SRGGB8 }, { XVIP_VF_MONO_SENSOR, 8, "grbg", MEDIA_BUS_FMT_SGRBG8_1X8, - 1, V4L2_PIX_FMT_SGRBG8, "Bayer 8-bit GRBG" }, + 1, V4L2_PIX_FMT_SGRBG8 }, { XVIP_VF_MONO_SENSOR, 8, "gbrg", MEDIA_BUS_FMT_SGBRG8_1X8, - 1, V4L2_PIX_FMT_SGBRG8, "Bayer 8-bit GBRG" }, + 1, V4L2_PIX_FMT_SGBRG8 }, { XVIP_VF_MONO_SENSOR, 8, "bggr", MEDIA_BUS_FMT_SBGGR8_1X8, - 1, V4L2_PIX_FMT_SBGGR8, "Bayer 8-bit BGGR" }, + 1, V4L2_PIX_FMT_SBGGR8 }, }; /** diff --git a/drivers/media/platform/xilinx/xilinx-vip.h b/drivers/media/platform/xilinx/xilinx-vip.h index ba939dd52818..47da39211ae4 100644 --- a/drivers/media/platform/xilinx/xilinx-vip.h +++ b/drivers/media/platform/xilinx/xilinx-vip.h @@ -108,7 +108,6 @@ struct xvip_device { * @code: media bus format code * @bpp: bytes per pixel (when stored in memory) * @fourcc: V4L2 pixel format FCC identifier - * @description: format description, suitable for userspace */ struct xvip_video_format { unsigned int vf_code; @@ -117,7 +116,6 @@ struct xvip_video_format { unsigned int code; unsigned int bpp; u32 fourcc; - const char *description; }; const struct xvip_video_format *xvip_get_format_by_code(unsigned int code); diff --git a/include/media/drv-intf/exynos-fimc.h b/include/media/drv-intf/exynos-fimc.h index 59703439bb37..6b9ef631d6bb 100644 --- a/include/media/drv-intf/exynos-fimc.h +++ b/include/media/drv-intf/exynos-fimc.h @@ -87,7 +87,6 @@ struct fimc_source_info { /** * struct fimc_fmt - color format data structure * @mbus_code: media bus pixel code, -1 if not applicable - * @name: format description * @fourcc: fourcc code for this format, 0 if not applicable * @color: the driver's private color format id * @memplanes: number of physically non-contiguous data planes @@ -99,7 +98,6 @@ struct fimc_source_info { */ struct fimc_fmt { u32 mbus_code; - char *name; u32 fourcc; u32 color; u16 memplanes; -- cgit v1.2.3 From 66b5f1c439843bcbab01cc7f3854ae2742f3d1e3 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Thu, 18 Jul 2019 23:30:03 -0700 Subject: net-ipv6-ndisc: add support for RFC7710 RA Captive Portal Identifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is trivial since we already have support for the entirely identical (from the kernel's point of view) RDNSS and DNSSL that also contain opaque data that needs to be passed down to userspace. As specified in RFC7710, Captive Portal option contains a URL. 8-bit identifier of the option type as assigned by the IANA is 37. This option should also be treated as userland. Hence, treat ND option 37 as userland (Captive Portal support) See: https://tools.ietf.org/html/rfc7710 https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml Fixes: e35f30c131a56 Signed-off-by: Maciej Żenczykowski Cc: Lorenzo Colitti Cc: Remin Nguyen Van Cc: Alexey I. Froloff Signed-off-by: David S. Miller --- include/net/ndisc.h | 1 + net/ipv6/ndisc.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 366150053043..b2f715ca0567 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -40,6 +40,7 @@ enum { ND_OPT_RDNSS = 25, /* RFC5006 */ ND_OPT_DNSSL = 31, /* RFC6106 */ ND_OPT_6CO = 34, /* RFC6775 */ + ND_OPT_CAPTIVE_PORTAL = 37, /* RFC7710 */ __ND_OPT_MAX }; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 083cc1c94cd3..53caf59c591e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -196,6 +196,7 @@ static inline int ndisc_is_useropt(const struct net_device *dev, { return opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || + opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || ndisc_ops_is_useropt(dev, opt->nd_opt_type); } -- cgit v1.2.3 From a57d6acaf352d91e52271704f45c72e14cd2d98a Mon Sep 17 00:00:00 2001 From: Pawel Osciak Date: Thu, 11 Jul 2019 16:26:42 -0400 Subject: media: uapi: Add VP8 stateless decoder API Add the parsed VP8 frame pixel format and controls, to be used with the new stateless decoder API for VP8 to provide parameters for accelerator (aka stateless) codecs. Reviewed-by: Tomasz Figa Reviewed-by: Boris Brezillon Reviewed-by: Nicolas Dufresne Signed-off-by: Pawel Osciak Signed-off-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/biblio.rst | 10 + Documentation/media/uapi/v4l/ext-ctrls-codec.rst | 323 +++++++++++++++++++++ Documentation/media/uapi/v4l/pixfmt-compressed.rst | 20 ++ drivers/media/v4l2-core/v4l2-ctrls.c | 10 + drivers/media/v4l2-core/v4l2-ioctl.c | 1 + include/media/v4l2-ctrls.h | 3 + include/media/vp8-ctrls.h | 110 +++++++ 7 files changed, 477 insertions(+) create mode 100644 include/media/vp8-ctrls.h (limited to 'include') diff --git a/Documentation/media/uapi/v4l/biblio.rst b/Documentation/media/uapi/v4l/biblio.rst index 8f4eb8823d82..ad2ff258afa8 100644 --- a/Documentation/media/uapi/v4l/biblio.rst +++ b/Documentation/media/uapi/v4l/biblio.rst @@ -395,3 +395,13 @@ colimg :title: Color Imaging: Fundamentals and Applications :author: Erik Reinhard et al. + +.. _vp8: + +VP8 +=== + + +:title: RFC 6386: "VP8 Data Format and Decoding Guide" + +:author: J. Bankoski et al. diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst index d6ea2ffd65c5..c5f39dd50043 100644 --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst @@ -2234,6 +2234,329 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type - Quantization parameter for a P frame for FWHT. Valid range: from 1 to 31. +.. _v4l2-mpeg-vp8: + +``V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER (struct)`` + Specifies the frame parameters for the associated VP8 parsed frame data. + This includes the necessary parameters for + configuring a stateless hardware decoding pipeline for VP8. + The bitstream parameters are defined according to :ref:`vp8`. + + .. note:: + + This compound control is not yet part of the public kernel API and + it is expected to change. + +.. c:type:: v4l2_ctrl_vp8_frame_header + +.. cssclass:: longtable + +.. tabularcolumns:: |p{5.8cm}|p{4.8cm}|p{6.6cm}| + +.. flat-table:: struct v4l2_ctrl_vp8_frame_header + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - struct :c:type:`v4l2_vp8_segment_header` + - ``segment_header`` + - Structure with segment-based adjustments metadata. + * - struct :c:type:`v4l2_vp8_loopfilter_header` + - ``loopfilter_header`` + - Structure with loop filter level adjustments metadata. + * - struct :c:type:`v4l2_vp8_quantization_header` + - ``quant_header`` + - Structure with VP8 dequantization indices metadata. + * - struct :c:type:`v4l2_vp8_entropy_header` + - ``entropy_header`` + - Structure with VP8 entropy coder probabilities metadata. + * - struct :c:type:`v4l2_vp8_entropy_coder_state` + - ``coder_state`` + - Structure with VP8 entropy coder state. + * - __u16 + - ``width`` + - The width of the frame. Must be set for all frames. + * - __u16 + - ``height`` + - The height of the frame. Must be set for all frames. + * - __u8 + - ``horizontal_scale`` + - Horizontal scaling factor. + * - __u8 + - ``vertical_scaling factor`` + - Vertical scale. + * - __u8 + - ``version`` + - Bitstream version. + * - __u8 + - ``prob_skip_false`` + - Indicates the probability that the macroblock is not skipped. + * - __u8 + - ``prob_intra`` + - Indicates the probability that a macroblock is intra-predicted. + * - __u8 + - ``prob_last`` + - Indicates the probability that the last reference frame is used + for inter-prediction + * - __u8 + - ``prob_gf`` + - Indicates the probability that the golden reference frame is used + for inter-prediction + * - __u8 + - ``num_dct_parts`` + - Number of DCT coefficients partitions. Must be one of: 1, 2, 4, or 8. + * - __u32 + - ``first_part_size`` + - Size of the first partition, i.e. the control partition. + * - __u32 + - ``first_part_header_bits`` + - Size in bits of the first partition header portion. + * - __u32 + - ``dct_part_sizes[8]`` + - DCT coefficients sizes. + * - __u64 + - ``last_frame_ts`` + - Timestamp for the V4L2 capture buffer to use as last reference frame, used + with inter-coded frames. The timestamp refers to the ``timestamp`` field in + struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()` + function to convert the struct :c:type:`timeval` in struct + :c:type:`v4l2_buffer` to a __u64. + * - __u64 + - ``golden_frame_ts`` + - Timestamp for the V4L2 capture buffer to use as last reference frame, used + with inter-coded frames. The timestamp refers to the ``timestamp`` field in + struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()` + function to convert the struct :c:type:`timeval` in struct + :c:type:`v4l2_buffer` to a __u64. + * - __u64 + - ``alt_frame_ts`` + - Timestamp for the V4L2 capture buffer to use as alternate reference frame, used + with inter-coded frames. The timestamp refers to the ``timestamp`` field in + struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()` + function to convert the struct :c:type:`timeval` in struct + :c:type:`v4l2_buffer` to a __u64. + * - __u64 + - ``flags`` + - See :ref:`Frame Header Flags ` + +.. _vp8_frame_header_flags: + +``Frame Header Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME`` + - 0x01 + - Indicates if the frame is a key frame. + * - ``V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL`` + - 0x02 + - Experimental bitstream. + * - ``V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME`` + - 0x04 + - Show frame flag, indicates if the frame is for display. + * - ``V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF`` + - 0x08 + - Enable/disable skipping of macroblocks with no non-zero coefficients. + * - ``V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN`` + - 0x10 + - Sign of motion vectors when the golden frame is referenced. + * - ``V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT`` + - 0x20 + - Sign of motion vectors when the alt frame is referenced. + +.. c:type:: v4l2_vp8_entropy_coder_state + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{6.3cm}|p{9.4cm}| + +.. flat-table:: struct v4l2_vp8_entropy_coder_state + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``range`` + - + * - __u8 + - ``value`` + - + * - __u8 + - ``bit_count`` + - + * - __u8 + - ``padding`` + - Applications and drivers must set this to zero. + +.. c:type:: v4l2_vp8_segment_header + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{6.3cm}|p{9.4cm}| + +.. flat-table:: struct v4l2_vp8_segment_header + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __s8 + - ``quant_update[4]`` + - Signed quantizer value update. + * - __s8 + - ``lf_update[4]`` + - Signed loop filter level value update. + * - __u8 + - ``segment_probs[3]`` + - Segment probabilities. + * - __u8 + - ``padding`` + - Applications and drivers must set this to zero. + * - __u32 + - ``flags`` + - See :ref:`Segment Header Flags ` + +.. _vp8_segment_header_flags: + +``Segment Header Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED`` + - 0x01 + - Enable/disable segment-based adjustments. + * - ``V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP`` + - 0x02 + - Indicates if the macroblock segmentation map is updated in this frame. + * - ``V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA`` + - 0x04 + - Indicates if the segment feature data is updated in this frame. + * - ``V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE`` + - 0x08 + - If is set, the segment feature data mode is delta-value. + If cleared, it's absolute-value. + +.. c:type:: v4l2_vp8_loopfilter_header + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{6.3cm}|p{9.4cm}| + +.. flat-table:: struct v4l2_vp8_loopfilter_header + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __s8 + - ``ref_frm_delta[4]`` + - Reference adjustment (signed) delta value. + * - __s8 + - ``mb_mode_delta[4]`` + - Macroblock prediction mode adjustment (signed) delta value. + * - __u8 + - ``sharpness_level`` + - Sharpness level + * - __u8 + - ``level`` + - Filter level + * - __u16 + - ``padding`` + - Applications and drivers must set this to zero. + * - __u32 + - ``flags`` + - See :ref:`Loopfilter Header Flags ` + +.. _vp8_loopfilter_header_flags: + +``Loopfilter Header Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_VP8_LF_HEADER_ADJ_ENABLE`` + - 0x01 + - Enable/disable macroblock-level loop filter adjustment. + * - ``V4L2_VP8_LF_HEADER_DELTA_UPDATE`` + - 0x02 + - Indicates if the delta values used in an adjustment are updated. + * - ``V4L2_VP8_LF_FILTER_TYPE_SIMPLE`` + - 0x04 + - If set, indicates the filter type is simple. + If cleared, the filter type is normal. + +.. c:type:: v4l2_vp8_quantization_header + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{6.3cm}|p{9.4cm}| + +.. flat-table:: struct v4l2_vp8_quantization_header + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``y_ac_qi`` + - Luma AC coefficient table index. + * - __s8 + - ``y_dc_delta`` + - Luma DC delta vaue. + * - __s8 + - ``y2_dc_delta`` + - Y2 block DC delta value. + * - __s8 + - ``y2_ac_delta`` + - Y2 block AC delta value. + * - __s8 + - ``uv_dc_delta`` + - Chroma DC delta value. + * - __s8 + - ``uv_ac_delta`` + - Chroma AC delta value. + * - __u16 + - ``padding`` + - Applications and drivers must set this to zero. + +.. c:type:: v4l2_vp8_entropy_header + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{6.3cm}|p{9.4cm}| + +.. flat-table:: struct v4l2_vp8_entropy_header + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``coeff_probs[4][8][3][11]`` + - Coefficient update probabilities. + * - __u8 + - ``y_mode_probs[4]`` + - Luma mode update probabilities. + * - __u8 + - ``uv_mode_probs[3]`` + - Chroma mode update probabilities. + * - __u8 + - ``mv_probs[2][19]`` + - MV decoding update probabilities. + * - __u8 + - ``padding[3]`` + - Applications and drivers must set this to zero. + .. raw:: latex \normalsize diff --git a/Documentation/media/uapi/v4l/pixfmt-compressed.rst b/Documentation/media/uapi/v4l/pixfmt-compressed.rst index 4b701fc7653e..f52a7b67023d 100644 --- a/Documentation/media/uapi/v4l/pixfmt-compressed.rst +++ b/Documentation/media/uapi/v4l/pixfmt-compressed.rst @@ -133,6 +133,26 @@ Compressed Formats - ``V4L2_PIX_FMT_VP8`` - 'VP80' - VP8 video elementary stream. + * .. _V4L2-PIX-FMT-VP8-FRAME: + + - ``V4L2_PIX_FMT_VP8_FRAME`` + - 'VP8F' + - VP8 parsed frame, as extracted from the container. + This format is adapted for stateless video decoders that implement a + VP8 pipeline (using the :ref:`mem2mem` and :ref:`media-request-api`). + Metadata associated with the frame to decode is required to be passed + through the ``V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER`` control. + See the :ref:`associated Codec Control IDs `. + Exactly one output and one capture buffer must be provided for use with + this pixel format. The output buffer must contain the appropriate number + of macroblocks to decode a full corresponding frame to the matching + capture buffer. + + .. note:: + + This format is not yet part of the public kernel API and it + is expected to change. + * .. _V4L2-PIX-FMT-VP9: - ``V4L2_PIX_FMT_VP9`` diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 739418aa9108..b2c9f5816c4a 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -885,6 +885,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP: return "VPX P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: return "VP8 Profile"; case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: return "VP9 Profile"; + case V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER: return "VP8 Frame Header"; /* HEVC controls */ case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_QP: return "HEVC I-Frame QP Value"; @@ -1345,6 +1346,9 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS: *type = V4L2_CTRL_TYPE_H264_DECODE_PARAMS; break; + case V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER: + *type = V4L2_CTRL_TYPE_VP8_FRAME_HEADER; + break; default: *type = V4L2_CTRL_TYPE_INTEGER; break; @@ -1690,6 +1694,9 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx, case V4L2_CTRL_TYPE_H264_SLICE_PARAMS: case V4L2_CTRL_TYPE_H264_DECODE_PARAMS: break; + + case V4L2_CTRL_TYPE_VP8_FRAME_HEADER: + break; default: return -EINVAL; } @@ -2360,6 +2367,9 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl, case V4L2_CTRL_TYPE_H264_DECODE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_h264_decode_params); break; + case V4L2_CTRL_TYPE_VP8_FRAME_HEADER: + elem_size = sizeof(struct v4l2_ctrl_vp8_frame_header); + break; default: if (type < V4L2_CTRL_COMPOUND_TYPES) elem_size = sizeof(s32); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 20cc23ef730e..80efc581e3f9 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1348,6 +1348,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_VC1_ANNEX_G: descr = "VC-1 (SMPTE 412M Annex G)"; break; case V4L2_PIX_FMT_VC1_ANNEX_L: descr = "VC-1 (SMPTE 412M Annex L)"; break; case V4L2_PIX_FMT_VP8: descr = "VP8"; break; + case V4L2_PIX_FMT_VP8_FRAME: descr = "VP8 Frame"; break; case V4L2_PIX_FMT_VP9: descr = "VP9"; break; case V4L2_PIX_FMT_HEVC: descr = "HEVC"; break; /* aka H.265 */ case V4L2_PIX_FMT_FWHT: descr = "FWHT"; break; /* used in vicodec */ diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index b4433483af23..6e9dc9c44bb1 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -20,6 +20,7 @@ #include #include #include +#include /* forward references */ struct file; @@ -48,6 +49,7 @@ struct poll_table_struct; * @p_h264_scaling_matrix: Pointer to a struct v4l2_ctrl_h264_scaling_matrix. * @p_h264_slice_params: Pointer to a struct v4l2_ctrl_h264_slice_params. * @p_h264_decode_params: Pointer to a struct v4l2_ctrl_h264_decode_params. + * @p_vp8_frame_header: Pointer to a VP8 frame header structure. * @p: Pointer to a compound value. */ union v4l2_ctrl_ptr { @@ -65,6 +67,7 @@ union v4l2_ctrl_ptr { struct v4l2_ctrl_h264_scaling_matrix *p_h264_scaling_matrix; struct v4l2_ctrl_h264_slice_params *p_h264_slice_params; struct v4l2_ctrl_h264_decode_params *p_h264_decode_params; + struct v4l2_ctrl_vp8_frame_header *p_vp8_frame_header; void *p; }; diff --git a/include/media/vp8-ctrls.h b/include/media/vp8-ctrls.h new file mode 100644 index 000000000000..6cc2eeea4c90 --- /dev/null +++ b/include/media/vp8-ctrls.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are the VP8 state controls for use with stateless VP8 + * codec drivers. + * + * It turns out that these structs are not stable yet and will undergo + * more changes. So keep them private until they are stable and ready to + * become part of the official public API. + */ + +#ifndef _VP8_CTRLS_H_ +#define _VP8_CTRLS_H_ + +#define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F') + +#define V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER (V4L2_CID_MPEG_BASE + 2000) +#define V4L2_CTRL_TYPE_VP8_FRAME_HEADER 0x301 + +#define V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED 0x01 +#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP 0x02 +#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA 0x04 +#define V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE 0x08 + +struct v4l2_vp8_segment_header { + __s8 quant_update[4]; + __s8 lf_update[4]; + __u8 segment_probs[3]; + __u8 padding; + __u32 flags; +}; + +#define V4L2_VP8_LF_HEADER_ADJ_ENABLE 0x01 +#define V4L2_VP8_LF_HEADER_DELTA_UPDATE 0x02 +#define V4L2_VP8_LF_FILTER_TYPE_SIMPLE 0x04 +struct v4l2_vp8_loopfilter_header { + __s8 ref_frm_delta[4]; + __s8 mb_mode_delta[4]; + __u8 sharpness_level; + __u8 level; + __u16 padding; + __u32 flags; +}; + +struct v4l2_vp8_quantization_header { + __u8 y_ac_qi; + __s8 y_dc_delta; + __s8 y2_dc_delta; + __s8 y2_ac_delta; + __s8 uv_dc_delta; + __s8 uv_ac_delta; + __u16 padding; +}; + +struct v4l2_vp8_entropy_header { + __u8 coeff_probs[4][8][3][11]; + __u8 y_mode_probs[4]; + __u8 uv_mode_probs[3]; + __u8 mv_probs[2][19]; + __u8 padding[3]; +}; + +struct v4l2_vp8_entropy_coder_state { + __u8 range; + __u8 value; + __u8 bit_count; + __u8 padding; +}; + +#define V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME 0x01 +#define V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL 0x02 +#define V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME 0x04 +#define V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF 0x08 +#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN 0x10 +#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT 0x20 + +#define VP8_FRAME_IS_KEY_FRAME(hdr) \ + (!!((hdr)->flags & V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME)) + +struct v4l2_ctrl_vp8_frame_header { + struct v4l2_vp8_segment_header segment_header; + struct v4l2_vp8_loopfilter_header lf_header; + struct v4l2_vp8_quantization_header quant_header; + struct v4l2_vp8_entropy_header entropy_header; + struct v4l2_vp8_entropy_coder_state coder_state; + + __u16 width; + __u16 height; + + __u8 horizontal_scale; + __u8 vertical_scale; + + __u8 version; + __u8 prob_skip_false; + __u8 prob_intra; + __u8 prob_last; + __u8 prob_gf; + __u8 num_dct_parts; + + __u32 first_part_size; + __u32 first_part_header_bits; + __u32 dct_part_sizes[8]; + + __u64 last_frame_ts; + __u64 golden_frame_ts; + __u64 alt_frame_ts; + + __u64 flags; +}; + +#endif -- cgit v1.2.3 From a81431e7d1077a84695fc7be487fc2b4930f8c3d Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 12 Jul 2019 18:46:59 -0400 Subject: media: rc: remove unused #define RC_PROTO_BIT_ALL This lists all the protocols that the kernel knows about, however there are no users. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- include/media/rc-map.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/media/rc-map.h b/include/media/rc-map.h index bebd3c4c6338..3a7f8728f6ec 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -38,22 +38,6 @@ #define RC_PROTO_BIT_RCMM32 BIT_ULL(RC_PROTO_RCMM32) #define RC_PROTO_BIT_XBOX_DVD BIT_ULL(RC_PROTO_XBOX_DVD) -#define RC_PROTO_BIT_ALL \ - (RC_PROTO_BIT_UNKNOWN | RC_PROTO_BIT_OTHER | \ - RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ - RC_PROTO_BIT_RC5_SZ | RC_PROTO_BIT_JVC | \ - RC_PROTO_BIT_SONY12 | RC_PROTO_BIT_SONY15 | \ - RC_PROTO_BIT_SONY20 | RC_PROTO_BIT_NEC | \ - RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32 | \ - RC_PROTO_BIT_SANYO | \ - RC_PROTO_BIT_MCIR2_KBD | RC_PROTO_BIT_MCIR2_MSE | \ - RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ - RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ - RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ - RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC | \ - RC_PROTO_BIT_IMON | RC_PROTO_BIT_RCMM12 | \ - RC_PROTO_BIT_RCMM24 | RC_PROTO_BIT_RCMM32 | \ - RC_PROTO_BIT_XBOX_DVD) /* All rc protocols for which we have decoders */ #define RC_PROTO_BIT_ALL_IR_DECODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ -- cgit v1.2.3 From 66193b24514c91aeda88da744554b2665471aeae Mon Sep 17 00:00:00 2001 From: Jan Pieter van Woerkom Date: Wed, 17 Jul 2019 14:09:10 -0400 Subject: media: dvbsky: add support for Mygica T230C v2 Adds support for the "Mygica T230C v2" to the dvbsky driver. Signed-off-by: Jan Pieter van Woerkom Tested-by: Frank Rysanek Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/usb/dvb-usb-v2/dvbsky.c | 5 +++++ include/media/dvb-usb-ids.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/drivers/media/usb/dvb-usb-v2/dvbsky.c b/drivers/media/usb/dvb-usb-v2/dvbsky.c index 8610487f2d72..bbfe1cfdc013 100644 --- a/drivers/media/usb/dvb-usb-v2/dvbsky.c +++ b/drivers/media/usb/dvb-usb-v2/dvbsky.c @@ -540,6 +540,8 @@ static int dvbsky_mygica_t230c_attach(struct dvb_usb_adapter *adap) si2168_config.i2c_adapter = &i2c_adapter; si2168_config.fe = &adap->fe[0]; si2168_config.ts_mode = SI2168_TS_PARALLEL; + if (le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_MYGICA_T230C2) + si2168_config.ts_mode |= SI2168_TS_CLK_MANUAL; si2168_config.ts_clock_inv = 1; state->i2c_client_demod = dvb_module_probe("si2168", NULL, @@ -779,6 +781,9 @@ static const struct usb_device_id dvbsky_id_table[] = { { DVB_USB_DEVICE(USB_VID_CONEXANT, USB_PID_MYGICA_T230C, &mygica_t230c_props, "MyGica Mini DVB-T2 USB Stick T230C", RC_MAP_TOTAL_MEDIA_IN_HAND_02) }, + { DVB_USB_DEVICE(USB_VID_CONEXANT, USB_PID_MYGICA_T230C2, + &mygica_t230c_props, "MyGica Mini DVB-T2 USB Stick T230C v2", + RC_MAP_TOTAL_MEDIA_IN_HAND_02) }, { } }; MODULE_DEVICE_TABLE(usb, dvbsky_id_table); diff --git a/include/media/dvb-usb-ids.h b/include/media/dvb-usb-ids.h index 52875e3eee71..7ce4e8332421 100644 --- a/include/media/dvb-usb-ids.h +++ b/include/media/dvb-usb-ids.h @@ -388,6 +388,7 @@ #define USB_PID_MYGICA_D689 0xd811 #define USB_PID_MYGICA_T230 0xc688 #define USB_PID_MYGICA_T230C 0xc689 +#define USB_PID_MYGICA_T230C2 0xc68a #define USB_PID_ELGATO_EYETV_DIVERSITY 0x0011 #define USB_PID_ELGATO_EYETV_DTT 0x0021 #define USB_PID_ELGATO_EYETV_DTT_2 0x003f -- cgit v1.2.3 From cf949bbe22bee8749078e0b810ee2dc60a983746 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Wed, 12 Jun 2019 16:34:37 +0300 Subject: scsi: ufs: uapi: Fix SPDX license identifier Added 'WITH Linux-syscall-note' exception which is the officially assigned exception identifier for the kernel syscall exception. This exception makes it possible to include GPL headers into non GPL code without confusing license compliance tools. Fixes: a851b2bd3632 (scsi: uapi: ufs: Make utp_upiu_req visible to user space) Signed-off-by: Avri Altman Reviewed-by: Pedro Sousa Signed-off-by: Martin K. Petersen --- include/uapi/scsi/scsi_bsg_ufs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/scsi/scsi_bsg_ufs.h b/include/uapi/scsi/scsi_bsg_ufs.h index 17c7abd0803a..9988db6ad244 100644 --- a/include/uapi/scsi/scsi_bsg_ufs.h +++ b/include/uapi/scsi/scsi_bsg_ufs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * UFS Transport SGIO v4 BSG Message Support * -- cgit v1.2.3 From 8930a6c207918d5a5675eedab06a71096b1a3d47 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 May 2019 13:28:10 +0200 Subject: scsi: core: add support for request batching This allows a list of requests to be issued, with the LLD only writing the hardware doorbell when necessary, after the last request was prepared. This is more efficient if we have lists of requests to issue, particularly on virtualized hardware, where writing the doorbell is more expensive than on real hardware. The use case for this is plugged IO, where blk-mq flushes a batch of requests all at once. The API is the same as for blk-mq, just with blk-mq concepts tweaked to fit the SCSI subsystem API: the "last" flag in blk_mq_queue_data becomes a flag in scsi_cmnd, while the queue_num in the commit_rqs callback is extracted from the hctx and passed as a parameter. The only complication is that blk-mq uses different plugging heuristics depending on whether commit_rqs is present or not. So we have two different sets of blk_mq_ops and pick one depending on whether the scsi_host template uses commit_rqs or not. Signed-off-by: Paolo Bonzini Reviewed-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 37 ++++++++++++++++++++++++++++++++++--- include/scsi/scsi_cmnd.h | 1 + include/scsi/scsi_host.h | 16 ++++++++++++++-- 3 files changed, 49 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9381171c2fc0..c72bce2f0cf1 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1666,10 +1666,11 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); } + cmd->flags &= SCMD_PRESERVED_FLAGS; if (sdev->simple_tags) cmd->flags |= SCMD_TAGGED; - else - cmd->flags &= ~SCMD_TAGGED; + if (bd->last) + cmd->flags |= SCMD_LAST; scsi_init_cmd_errh(cmd); cmd->scsi_done = scsi_mq_done; @@ -1807,10 +1808,37 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) } EXPORT_SYMBOL_GPL(__scsi_init_queue); +static const struct blk_mq_ops scsi_mq_ops_no_commit = { + .get_budget = scsi_mq_get_budget, + .put_budget = scsi_mq_put_budget, + .queue_rq = scsi_queue_rq, + .complete = scsi_softirq_done, + .timeout = scsi_timeout, +#ifdef CONFIG_BLK_DEBUG_FS + .show_rq = scsi_show_rq, +#endif + .init_request = scsi_mq_init_request, + .exit_request = scsi_mq_exit_request, + .initialize_rq_fn = scsi_initialize_rq, + .busy = scsi_mq_lld_busy, + .map_queues = scsi_map_queues, +}; + + +static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct scsi_device *sdev = q->queuedata; + struct Scsi_Host *shost = sdev->host; + + shost->hostt->commit_rqs(shost, hctx->queue_num); +} + static const struct blk_mq_ops scsi_mq_ops = { .get_budget = scsi_mq_get_budget, .put_budget = scsi_mq_put_budget, .queue_rq = scsi_queue_rq, + .commit_rqs = scsi_commit_rqs, .complete = scsi_softirq_done, .timeout = scsi_timeout, #ifdef CONFIG_BLK_DEBUG_FS @@ -1846,7 +1874,10 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) sizeof(struct scatterlist) * SCSI_INLINE_PROT_SG_CNT; memset(&shost->tag_set, 0, sizeof(shost->tag_set)); - shost->tag_set.ops = &scsi_mq_ops; + if (shost->hostt->commit_rqs) + shost->tag_set.ops = &scsi_mq_ops; + else + shost->tag_set.ops = &scsi_mq_ops_no_commit; shost->tag_set.nr_hw_queues = shost->nr_hw_queues ? : 1; shost->tag_set.queue_depth = shost->can_queue; shost->tag_set.cmd_size = cmd_size; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 76ed5e4acd38..91bd749a02f7 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -57,6 +57,7 @@ struct scsi_pointer { #define SCMD_TAGGED (1 << 0) #define SCMD_UNCHECKED_ISA_DMA (1 << 1) #define SCMD_INITIALIZED (1 << 2) +#define SCMD_LAST (1 << 3) /* flags preserved across unprep / reprep */ #define SCMD_PRESERVED_FLAGS (SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED) diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index cc139dbd71e5..31e0d6ca1eba 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -80,8 +80,10 @@ struct scsi_host_template { * command block to the LLDD. When the driver finished * processing the command the done callback is invoked. * - * If queuecommand returns 0, then the HBA has accepted the - * command. The done() function must be called on the command + * If queuecommand returns 0, then the driver has accepted the + * command. It must also push it to the HBA if the scsi_cmnd + * flag SCMD_LAST is set, or if the driver does not implement + * commit_rqs. The done() function must be called on the command * when the driver has finished with it. (you may call done on the * command before queuecommand returns, but in this case you * *must* return 0 from queuecommand). @@ -109,6 +111,16 @@ struct scsi_host_template { */ int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *); + /* + * The commit_rqs function is used to trigger a hardware + * doorbell after some requests have been queued with + * queuecommand, when an error is encountered before sending + * the request with SCMD_LAST set. + * + * STATUS: OPTIONAL + */ + void (*commit_rqs)(struct Scsi_Host *, u16); + /* * This is an error handling strategy routine. You don't need to * define one of these if you don't want to - there is a default -- cgit v1.2.3 From 6ee82ef04e38b0d8b09b04bc1b068673deed6582 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Mon, 1 Jul 2019 13:46:51 +0200 Subject: clk: Add missing documentation of devm_clk_bulk_get_optional() argument Fix an incomplete devm_clk_bulk_get_optional() function documentation by adding description of the num_clks argument as in other *clk_bulk* functions. Fixes: 9bd5ef0bd874 ("clk: Add devm_clk_bulk_get_optional() function") Reported-by: kbuild test robot Signed-off-by: Sylwester Nawrocki Signed-off-by: Stephen Boyd --- include/linux/clk.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/clk.h b/include/linux/clk.h index 3c096c7a51dc..853a8f181394 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -359,6 +359,7 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, /** * devm_clk_bulk_get_optional - managed get multiple optional consumer clocks * @dev: device for clock "consumer" + * @num_clks: the number of clk_bulk_data * @clks: pointer to the clk_bulk_data table of consumer * * Behaves the same as devm_clk_bulk_get() except where there is no clock -- cgit v1.2.3 From af311ff9a69189a03548efd5a47d4bb44644fd45 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 17 May 2019 14:09:22 -0700 Subject: firmware: qcom_scm: Cleanup code in qcom_scm_assign_mem() There are some questionable coding styles in this function. It looks quite odd to deref a pointer with array indexing that only uses the first element. Also, destroying an input/output variable halfway through the function and then overwriting it on success is not clear. It's better to use a local variable and the kernel macros to step through each bit set in a bitmask and clearly show where outputs are set. Cc: Ian Jackson Cc: Julien Grall Cc: Bjorn Andersson Cc: Avaneesh Kumar Dwivedi Tested-by: Bjorn Andersson Signed-off-by: Stephen Boyd [bjorn: Changed for_each_set_bit() size to BITS_PER_LONG] Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom_scm.c | 34 ++++++++++++++++------------------ include/linux/qcom_scm.h | 9 +++++---- 2 files changed, 21 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c index c7e63af91567..4802ab170fe5 100644 --- a/drivers/firmware/qcom_scm.c +++ b/drivers/firmware/qcom_scm.c @@ -434,7 +434,8 @@ EXPORT_SYMBOL(qcom_scm_set_remote_state); */ int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, unsigned int *srcvm, - struct qcom_scm_vmperm *newvm, int dest_cnt) + const struct qcom_scm_vmperm *newvm, + unsigned int dest_cnt) { struct qcom_scm_current_perm_info *destvm; struct qcom_scm_mem_map_info *mem_to_map; @@ -449,11 +450,10 @@ int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, int next_vm; __le32 *src; void *ptr; - int ret; - int len; - int i; + int ret, i, b; + unsigned long srcvm_bits = *srcvm; - src_sz = hweight_long(*srcvm) * sizeof(*src); + src_sz = hweight_long(srcvm_bits) * sizeof(*src); mem_to_map_sz = sizeof(*mem_to_map); dest_sz = dest_cnt * sizeof(*destvm); ptr_sz = ALIGN(src_sz, SZ_64) + ALIGN(mem_to_map_sz, SZ_64) + @@ -466,28 +466,26 @@ int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, /* Fill source vmid detail */ src = ptr; - len = hweight_long(*srcvm); - for (i = 0; i < len; i++) { - src[i] = cpu_to_le32(ffs(*srcvm) - 1); - *srcvm ^= 1 << (ffs(*srcvm) - 1); - } + i = 0; + for_each_set_bit(b, &srcvm_bits, BITS_PER_LONG) + src[i++] = cpu_to_le32(b); /* Fill details of mem buff to map */ mem_to_map = ptr + ALIGN(src_sz, SZ_64); mem_to_map_phys = ptr_phys + ALIGN(src_sz, SZ_64); - mem_to_map[0].mem_addr = cpu_to_le64(mem_addr); - mem_to_map[0].mem_size = cpu_to_le64(mem_sz); + mem_to_map->mem_addr = cpu_to_le64(mem_addr); + mem_to_map->mem_size = cpu_to_le64(mem_sz); next_vm = 0; /* Fill details of next vmid detail */ destvm = ptr + ALIGN(mem_to_map_sz, SZ_64) + ALIGN(src_sz, SZ_64); dest_phys = ptr_phys + ALIGN(mem_to_map_sz, SZ_64) + ALIGN(src_sz, SZ_64); - for (i = 0; i < dest_cnt; i++) { - destvm[i].vmid = cpu_to_le32(newvm[i].vmid); - destvm[i].perm = cpu_to_le32(newvm[i].perm); - destvm[i].ctx = 0; - destvm[i].ctx_size = 0; - next_vm |= BIT(newvm[i].vmid); + for (i = 0; i < dest_cnt; i++, destvm++, newvm++) { + destvm->vmid = cpu_to_le32(newvm->vmid); + destvm->perm = cpu_to_le32(newvm->perm); + destvm->ctx = 0; + destvm->ctx_size = 0; + next_vm |= BIT(newvm->vmid); } ret = __qcom_scm_assign_mem(__scm->dev, mem_to_map_phys, mem_to_map_sz, diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index 3f12cc77fb58..2d5eff506e13 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -49,8 +49,9 @@ extern int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr, extern int qcom_scm_pas_auth_and_reset(u32 peripheral); extern int qcom_scm_pas_shutdown(u32 peripheral); extern int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, - unsigned int *src, struct qcom_scm_vmperm *newvm, - int dest_cnt); + unsigned int *src, + const struct qcom_scm_vmperm *newvm, + unsigned int dest_cnt); extern void qcom_scm_cpu_power_down(u32 flags); extern u32 qcom_scm_get_version(void); extern int qcom_scm_set_remote_state(u32 state, u32 id); @@ -87,8 +88,8 @@ qcom_scm_pas_auth_and_reset(u32 peripheral) { return -ENODEV; } static inline int qcom_scm_pas_shutdown(u32 peripheral) { return -ENODEV; } static inline int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, unsigned int *src, - struct qcom_scm_vmperm *newvm, - int dest_cnt) { return -ENODEV; } + const struct qcom_scm_vmperm *newvm, + unsigned int dest_cnt) { return -ENODEV; } static inline void qcom_scm_cpu_power_down(u32 flags) {} static inline u32 qcom_scm_get_version(void) { return 0; } static inline u32 -- cgit v1.2.3 From d8e18a516f8f67404c0d21af8c93d0474fba0876 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 22 Jul 2019 20:08:26 -0700 Subject: net: Use skb accessors in network core In preparation for unifying the skb_frag and bio_vec, use the fine accessors which already exist and use skb_frag_t instead of struct skb_frag_struct. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 24 ++++++++++++++---------- net/core/tso.c | 8 ++++---- net/ipv4/tcp.c | 14 ++++++++------ net/kcm/kcmsock.c | 8 ++++---- net/tls/tls_device.c | 14 +++++++------- 6 files changed, 38 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8af86d995d6..f9078e7edb53 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3166,7 +3166,7 @@ static inline bool skb_can_coalesce(struct sk_buff *skb, int i, if (skb_zcopy(skb)) return false; if (i) { - const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; return page == skb_frag_page(frag) && off == frag->page_offset + skb_frag_size(frag); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0338820ee0ec..ba9a36903503 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2485,19 +2485,19 @@ do_frag_list: for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; - if (offset < frag->size) + if (offset < skb_frag_size(frag)) break; - offset -= frag->size; + offset -= skb_frag_size(frag); } for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; - slen = min_t(size_t, len, frag->size - offset); + slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = kernel_sendpage_locked(sk, frag->page.p, + ret = kernel_sendpage_locked(sk, skb_frag_page(frag), frag->page_offset + offset, slen, MSG_DONTWAIT); if (ret <= 0) @@ -2975,11 +2975,15 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + int size; + if (!len) break; skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; - skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); - len -= skb_shinfo(to)->frags[j].size; + size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), + len); + skb_frag_size_set(&skb_shinfo(to)->frags[j], size); + len -= size; skb_frag_ref(to, j); j++; } @@ -3293,7 +3297,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb) int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) { int from, to, merge, todo; - struct skb_frag_struct *fragfrom, *fragto; + skb_frag_t *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); @@ -3625,10 +3629,10 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) struct page *page; page = virt_to_head_page(frag_skb->head); - head_frag.page.p = page; + __skb_frag_set_page(&head_frag, page); head_frag.page_offset = frag_skb->data - (unsigned char *)page_address(page); - head_frag.size = skb_headlen(frag_skb); + skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); return head_frag; } @@ -4021,7 +4025,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - frag->page.p = page; + __skb_frag_set_page(frag, page); frag->page_offset = first_offset; skb_frag_size_set(frag, first_size); diff --git a/net/core/tso.c b/net/core/tso.c index 43f4eba61933..d4d5c077ad72 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -55,8 +55,8 @@ void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; /* Move to next segment */ - tso->size = frag->size; - tso->data = page_address(frag->page.p) + frag->page_offset; + tso->size = skb_frag_size(frag); + tso->data = skb_frag_address(frag); tso->next_frag_idx++; } } @@ -79,8 +79,8 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso) skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; /* Move to next segment */ - tso->size = frag->size; - tso->data = page_address(frag->page.p) + frag->page_offset; + tso->size = skb_frag_size(frag); + tso->data = skb_frag_address(frag); tso->next_frag_idx++; } } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 776905899ac0..f62f0e7e3cdd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1776,19 +1776,21 @@ static int tcp_zerocopy_receive(struct sock *sk, break; frags = skb_shinfo(skb)->frags; while (offset) { - if (frags->size > offset) + if (skb_frag_size(frags) > offset) goto out; - offset -= frags->size; + offset -= skb_frag_size(frags); frags++; } } - if (frags->size != PAGE_SIZE || frags->page_offset) { + if (skb_frag_size(frags) != PAGE_SIZE || frags->page_offset) { int remaining = zc->recv_skip_hint; + int size = skb_frag_size(frags); - while (remaining && (frags->size != PAGE_SIZE || + while (remaining && (size != PAGE_SIZE || frags->page_offset)) { - remaining -= frags->size; + remaining -= size; frags++; + size = skb_frag_size(frags); } zc->recv_skip_hint -= remaining; break; @@ -3781,7 +3783,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, return 1; for (i = 0; i < shi->nr_frags; ++i) { - const struct skb_frag_struct *f = &shi->frags[i]; + const skb_frag_t *f = &shi->frags[i]; unsigned int offset = f->page_offset; struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 5dbc0c48f8cb..05f63c4300e9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -635,15 +635,15 @@ do_frag_list: frag_offset = 0; do_frag: frag = &skb_shinfo(skb)->frags[fragidx]; - if (WARN_ON(!frag->size)) { + if (WARN_ON(!skb_frag_size(frag))) { ret = -EINVAL; goto out; } ret = kernel_sendpage(psock->sk->sk_socket, - frag->page.p, + skb_frag_page(frag), frag->page_offset + frag_offset, - frag->size - frag_offset, + skb_frag_size(frag) - frag_offset, MSG_DONTWAIT); if (ret <= 0) { if (ret == -EAGAIN) { @@ -678,7 +678,7 @@ do_frag: sent += ret; frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); - if (frag_offset < frag->size) { + if (frag_offset < skb_frag_size(frag)) { /* Not finished with this frag */ goto do_frag; } diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 7c0b2b778703..4ec8a06fa5d1 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -243,14 +243,14 @@ static void tls_append_frag(struct tls_record_info *record, skb_frag_t *frag; frag = &record->frags[record->num_frags - 1]; - if (frag->page.p == pfrag->page && - frag->page_offset + frag->size == pfrag->offset) { - frag->size += size; + if (skb_frag_page(frag) == pfrag->page && + frag->page_offset + skb_frag_size(frag) == pfrag->offset) { + skb_frag_size_add(frag, size); } else { ++frag; - frag->page.p = pfrag->page; + __skb_frag_set_page(frag, pfrag->page); frag->page_offset = pfrag->offset; - frag->size = size; + skb_frag_size_set(frag, size); ++record->num_frags; get_page(pfrag->page); } @@ -301,8 +301,8 @@ static int tls_push_record(struct sock *sk, frag = &record->frags[i]; sg_unmark_end(&offload_ctx->sg_tx_data[i]); sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), - frag->size, frag->page_offset); - sk_mem_charge(sk, frag->size); + skb_frag_size(frag), frag->page_offset); + sk_mem_charge(sk, skb_frag_size(frag)); get_page(skb_frag_page(frag)); } sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]); -- cgit v1.2.3 From b656722906efe434f0befe1d4ae4bb7a66fdc124 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 22 Jul 2019 20:08:27 -0700 Subject: net: Increase the size of skb_frag_t To increase commonality between block and net, we are going to replace the skb_frag_t with the bio_vec. This patch increases the size of skb_frag_t on 32-bit machines from 8 bytes to 12 bytes. The size is unchanged on 64-bit machines. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f9078e7edb53..7910935410e6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -314,13 +314,8 @@ struct skb_frag_struct { struct { struct page *p; } page; -#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) __u32 page_offset; __u32 size; -#else - __u16 page_offset; - __u16 size; -#endif }; /** -- cgit v1.2.3 From f58ecf1b7d58911921014ffd12c77a4ad33ade71 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 22 Jul 2019 20:08:28 -0700 Subject: net: Reorder the contents of skb_frag_t Match the layout of bio_vec. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7910935410e6..b9dc8b4f24b1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -314,8 +314,8 @@ struct skb_frag_struct { struct { struct page *p; } page; - __u32 page_offset; __u32 size; + __u32 page_offset; }; /** -- cgit v1.2.3 From 1dfa5bd38545c6f6a8b6c496e58db93f80da1076 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 22 Jul 2019 20:08:29 -0700 Subject: net: Rename skb_frag page to bv_page One step closer to turning the skb_frag_t into a bio_vec. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 +++++------- net/core/skbuff.c | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b9dc8b4f24b1..8076e2ba8349 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -311,9 +311,7 @@ extern int sysctl_max_skb_frags; typedef struct skb_frag_struct skb_frag_t; struct skb_frag_struct { - struct { - struct page *p; - } page; + struct page *bv_page; __u32 size; __u32 page_offset; }; @@ -374,7 +372,7 @@ static inline bool skb_frag_must_loop(struct page *p) * skb_frag_foreach_page - loop over pages in a fragment * * @f: skb frag to operate on - * @f_off: offset from start of f->page.p + * @f_off: offset from start of f->bv_page * @f_len: length from f_off to loop over * @p: (temp var) current page * @p_off: (temp var) offset from start of current page, @@ -2084,7 +2082,7 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). */ - frag->page.p = page; + frag->bv_page = page; frag->page_offset = off; skb_frag_size_set(frag, size); @@ -2872,7 +2870,7 @@ static inline void skb_propagate_pfmemalloc(struct page *page, */ static inline struct page *skb_frag_page(const skb_frag_t *frag) { - return frag->page.p; + return frag->bv_page; } /** @@ -2958,7 +2956,7 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag) */ static inline void __skb_frag_set_page(skb_frag_t *frag, struct page *page) { - frag->page.p = page; + frag->bv_page = page; } /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ba9a36903503..0b788df5a75b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3364,7 +3364,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) } else { __skb_frag_ref(fragfrom); - fragto->page = fragfrom->page; + fragto->bv_page = fragfrom->bv_page; fragto->page_offset = fragfrom->page_offset; skb_frag_size_set(fragto, todo); -- cgit v1.2.3 From b8b576a16f79efbdde49348147f491b176537d88 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 22 Jul 2019 20:08:30 -0700 Subject: net: Rename skb_frag_t size to bv_len Improved compatibility with bvec Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8076e2ba8349..e849e411d1f3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -312,7 +312,7 @@ typedef struct skb_frag_struct skb_frag_t; struct skb_frag_struct { struct page *bv_page; - __u32 size; + unsigned int bv_len; __u32 page_offset; }; @@ -322,7 +322,7 @@ struct skb_frag_struct { */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { - return frag->size; + return frag->bv_len; } /** @@ -332,7 +332,7 @@ static inline unsigned int skb_frag_size(const skb_frag_t *frag) */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { - frag->size = size; + frag->bv_len = size; } /** @@ -342,7 +342,7 @@ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { - frag->size += delta; + frag->bv_len += delta; } /** @@ -352,7 +352,7 @@ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { - frag->size -= delta; + frag->bv_len -= delta; } /** -- cgit v1.2.3 From 8842d285bafa9ff7719f4107b6545a11dcd41995 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 22 Jul 2019 20:08:31 -0700 Subject: net: Convert skb_frag_t to bio_vec There are a lot of users of frag->page_offset, so use a union to avoid converting those users today. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David S. Miller --- include/linux/bvec.h | 5 ++++- include/linux/skbuff.h | 9 ++------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index a032f01e928c..7f2b2ea9399c 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -18,7 +18,10 @@ struct bio_vec { struct page *bv_page; unsigned int bv_len; - unsigned int bv_offset; + union { + __u32 page_offset; + unsigned int bv_offset; + }; }; struct bvec_iter { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e849e411d1f3..718742b1c505 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -308,13 +309,7 @@ extern int sysctl_max_skb_frags; */ #define GSO_BY_FRAGS 0xFFFF -typedef struct skb_frag_struct skb_frag_t; - -struct skb_frag_struct { - struct page *bv_page; - unsigned int bv_len; - __u32 page_offset; -}; +typedef struct bio_vec skb_frag_t; /** * skb_frag_size - Returns the size of a skb fragment -- cgit v1.2.3 From 3a79bc63d90750f737ab9d7219bd3091d2fd6d84 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 13:03:20 +0200 Subject: PCI: irq: Introduce rearm_wake_irq() Introduce a new function, rearm_wake_irq(), allowing a wakeup IRQ to be armed for systen wakeup detection again without running any action handlers associated with it after it has been armed for wakeup detection and triggered. That is useful for IRQs, like ACPI SCI, that may deliver wakeup as well as non-wakeup interrupts when armed for systen wakeup detection. In those cases, it may be possible to determine whether or not the delivered interrupt is a systen wakeup one without running the entire action handler (or handlers, if the IRQ is shared) for the IRQ, and if the interrupt turns out to be a non-wakeup one, the IRQ can be rearmed with the help of the new function. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- include/linux/interrupt.h | 1 + kernel/irq/pm.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5b8328a99b2a..0e9cdb3efda7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -238,6 +238,7 @@ extern void teardown_percpu_nmi(unsigned int irq); /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); +extern void rearm_wake_irq(unsigned int irq); /** * struct irq_affinity_notify - context for notification of IRQ affinity changes diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d6961d3c6f9e..8f557fa1f4fe 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -176,6 +176,26 @@ static void resume_irqs(bool want_early) } } +/** + * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup + * @irq: Interrupt to rearm + */ +void rearm_wake_irq(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + + if (!desc || !(desc->istate & IRQS_SUSPENDED) || + !irqd_is_wakeup_set(&desc->irq_data)) + return; + + desc->istate &= ~IRQS_SUSPENDED; + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + __enable_irq(desc); + + irq_put_desc_busunlock(desc, flags); +} + /** * irq_pm_syscore_ops - enable interrupt lines early * -- cgit v1.2.3 From 6921de898ba8f2ec91cfea70e7160b89c477382e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 13:03:28 +0200 Subject: ACPICA: Return u32 from acpi_dispatch_gpe() In some cases it is useful to know whether or not the acpi_ev_detect_gpe() called by acpi_dispatch_gpe() has found the GPE to be active, so return the return value of it (whose data type is u32) from latter. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- drivers/acpi/acpica/evxfgpe.c | 6 +++--- include/acpi/acpixf.h | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c index 710488ec59e9..04a40d563dd6 100644 --- a/drivers/acpi/acpica/evxfgpe.c +++ b/drivers/acpi/acpica/evxfgpe.c @@ -644,17 +644,17 @@ ACPI_EXPORT_SYMBOL(acpi_get_gpe_status) * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 * gpe_number - GPE level within the GPE block * - * RETURN: None + * RETURN: INTERRUPT_HANDLED or INTERRUPT_NOT_HANDLED * * DESCRIPTION: Detect and dispatch a General Purpose Event to either a function * (e.g. EC) or method (e.g. _Lxx/_Exx) handler. * ******************************************************************************/ -void acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number) +u32 acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number) { ACPI_FUNCTION_TRACE(acpi_dispatch_gpe); - acpi_ev_detect_gpe(gpe_device, NULL, gpe_number); + return acpi_ev_detect_gpe(gpe_device, NULL, gpe_number); } ACPI_EXPORT_SYMBOL(acpi_dispatch_gpe) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 3845c8fcc94e..4ed603a3b448 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -297,6 +297,9 @@ ACPI_GLOBAL(u8, acpi_gbl_system_awake_and_running); #define ACPI_HW_DEPENDENT_RETURN_OK(prototype) \ ACPI_EXTERNAL_RETURN_OK(prototype) +#define ACPI_HW_DEPENDENT_RETURN_UINT32(prototype) \ + ACPI_EXTERNAL_RETURN_UINT32(prototype) + #define ACPI_HW_DEPENDENT_RETURN_VOID(prototype) \ ACPI_EXTERNAL_RETURN_VOID(prototype) @@ -307,6 +310,9 @@ ACPI_GLOBAL(u8, acpi_gbl_system_awake_and_running); #define ACPI_HW_DEPENDENT_RETURN_OK(prototype) \ static ACPI_INLINE prototype {return(AE_OK);} +#define ACPI_HW_DEPENDENT_RETURN_UINT32(prototype) \ + static ACPI_INLINE prototype {return(0);} + #define ACPI_HW_DEPENDENT_RETURN_VOID(prototype) \ static ACPI_INLINE prototype {return;} @@ -738,7 +744,7 @@ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status u32 gpe_number, acpi_event_status *event_status)) -ACPI_HW_DEPENDENT_RETURN_VOID(void acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number)) +ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void)) -- cgit v1.2.3 From 56b991849009f5def0443bfb2f48c8321d888e15 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 23:52:03 +0200 Subject: PM: sleep: Simplify suspend-to-idle control flow After commit 33e4f80ee69b ("ACPI / PM: Ignore spurious SCI wakeups from suspend-to-idle") the "noirq" phases of device suspend and resume may run for multiple times during suspend-to-idle, if there are spurious system wakeup events while suspended. However, this is complicated and fragile and actually unnecessary. The main reason for doing this is that on some systems the EC may signal system wakeup events (power button events, for example) as well as events that should not cause the system to resume (spurious system wakeup events). Thus, in order to determine whether or not a given event signaled by the EC while suspended is a proper system wakeup one, the EC GPE needs to be dispatched and to start with that was achieved by allowing the ACPI SCI action handler to run, which was only possible after calling resume_device_irqs(). However, dispatching the EC GPE this way turned out to take too much time in some cases and some EC events might be missed due to that, so commit 68e22011856f ("ACPI: EC: Dispatch the EC GPE directly on s2idle wake") started to dispatch the EC GPE right after a wakeup event has been detected, so in fact the full ACPI SCI action handler doesn't need to run any more to deal with the wakeups coming from the EC. Use this observation to simplify the suspend-to-idle control flow so that the "noirq" phases of device suspend and resume are each run only once in every suspend-to-idle cycle, which is reported to significantly reduce power drawn by some systems when suspended to idle (by allowing them to reach a deep platform-wide low-power state through the suspend-to-idle flow). [What appears to happen is that the "noirq" resume of devices after a spurious EC wakeup brings some devices into a state in which they prevent the platform from reaching the deep low-power state going forward, even after a subsequent "noirq" suspend phase, and on some systems the EC triggers such wakeups already when the "noirq" suspend of devices is running for the first time in the given suspend/resume cycle, so the platform cannot reach the deep low-power state at all.] First, make acpi_s2idle_wake() use the acpi_ec_dispatch_gpe() return value to determine whether or not the wakeup may have been triggered by the EC (in which case the system wakeup is canceled and ACPI events are processed in order to determine whether or not the event is a proper system wakeup one) and use rearm_wake_irq() (introduced by a previous change) in it to rearm the ACPI SCI for system wakeup detection in case the system will remain suspended. Second, drop acpi_s2idle_sync(), which is not needed any more, and the corresponding global platform suspend-to-idle callback. Next, drop the pm_wakeup_pending() check (which is an optimization only) from __device_suspend_noirq() to prevent it from returning errors on system wakeups occurring before the "noirq" phase of device suspend is complete (as in the case of suspend-to-idle it is not known whether or not these wakeups are suprious at that point), in order to avoid having to carry out a "noirq" resume of devices on a spurious system wakeup. Finally, change the code flow in s2idle_loop() to (1) run the "noirq" suspend of devices once before starting the loop, (2) check for spurious EC wakeups (via the platform ->wake callback) for the first time before calling s2idle_enter(), and (3) run the "noirq" resume of devices once after leaving the loop. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- drivers/acpi/sleep.c | 47 +++++++++++++++++++++-------------------- drivers/base/power/main.c | 5 ----- include/linux/suspend.h | 1 - kernel/power/suspend.c | 53 ++++++++++++++++++++--------------------------- 4 files changed, 48 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 3debe1a42655..970ae7c7a3f7 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -986,33 +986,37 @@ static void acpi_s2idle_wake(void) lpi_check_constraints(); /* - * If IRQD_WAKEUP_ARMED is not set for the SCI at this point, it means - * that the SCI has triggered while suspended, so cancel the wakeup in - * case it has not been a wakeup event (the GPEs will be checked later). + * If IRQD_WAKEUP_ARMED is set for the SCI at this point, the SCI has + * not triggered while suspended, so bail out. */ - if (acpi_sci_irq_valid() && - !irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) { + if (!acpi_sci_irq_valid() || + irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) + return; + + /* + * If there are EC events to process, the wakeup may be a spurious one + * coming from the EC. + */ + if (acpi_ec_dispatch_gpe()) { + /* + * Cancel the wakeup and process all pending events in case + * there are any wakeup ones in there. + * + * Note that if any non-EC GPEs are active at this point, the + * SCI will retrigger after the rearming below, so no events + * should be missed by canceling the wakeup here. + */ pm_system_cancel_wakeup(); /* - * On some platforms with the LPS0 _DSM device noirq resume - * takes too much time for EC wakeup events to survive, so look - * for them now. + * The EC driver uses the system workqueue and an additional + * special one, so those need to be flushed too. */ - acpi_ec_dispatch_gpe(); + acpi_os_wait_events_complete(); /* synchronize EC GPE processing */ + acpi_ec_flush_work(); + acpi_os_wait_events_complete(); /* synchronize Notify handling */ } -} -static void acpi_s2idle_sync(void) -{ - /* - * Process all pending events in case there are any wakeup ones. - * - * The EC driver uses the system workqueue and an additional special - * one, so those need to be flushed too. - */ - acpi_os_wait_events_complete(); /* synchronize SCI IRQ handling */ - acpi_ec_flush_work(); - acpi_os_wait_events_complete(); /* synchronize Notify handling */ + rearm_wake_irq(acpi_sci_irq); } static void acpi_s2idle_restore(void) @@ -1044,7 +1048,6 @@ static const struct platform_s2idle_ops acpi_s2idle_ops = { .begin = acpi_s2idle_begin, .prepare = acpi_s2idle_prepare, .wake = acpi_s2idle_wake, - .sync = acpi_s2idle_sync, .restore = acpi_s2idle_restore, .end = acpi_s2idle_end, }; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 7fb2c39bc725..f08332fab531 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1291,11 +1291,6 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a if (async_error) goto Complete; - if (pm_wakeup_pending()) { - async_error = -EBUSY; - goto Complete; - } - if (dev->power.syscore || dev->power.direct_complete) goto Complete; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 9c0ad1a3a727..66ce3871ed61 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -191,7 +191,6 @@ struct platform_s2idle_ops { int (*begin)(void); int (*prepare)(void); void (*wake)(void); - void (*sync)(void); void (*restore)(void); void (*end)(void); }; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c874a7026e24..907b2be0372f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -119,48 +119,41 @@ static void s2idle_enter(void) static void s2idle_loop(void) { + int error; + + dpm_noirq_begin(); + error = dpm_noirq_suspend_devices(PMSG_SUSPEND); + if (error) + goto resume; + pm_pr_dbg("suspend-to-idle\n"); + /* + * Suspend-to-idle equals: + * frozen processes + suspended devices + idle processors. + * Thus s2idle_enter() should be called right after all devices have + * been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, so try + * to avoid them upfront. + */ for (;;) { - int error; - - dpm_noirq_begin(); - - /* - * Suspend-to-idle equals - * frozen processes + suspended devices + idle processors. - * Thus s2idle_enter() should be called right after - * all devices have been suspended. - * - * Wakeups during the noirq suspend of devices may be spurious, - * so prevent them from terminating the loop right away. - */ - error = dpm_noirq_suspend_devices(PMSG_SUSPEND); - if (!error) - s2idle_enter(); - else if (error == -EBUSY && pm_wakeup_pending()) - error = 0; - - if (!error && s2idle_ops && s2idle_ops->wake) + if (s2idle_ops && s2idle_ops->wake) s2idle_ops->wake(); - dpm_noirq_resume_devices(PMSG_RESUME); - - dpm_noirq_end(); - - if (error) - break; - - if (s2idle_ops && s2idle_ops->sync) - s2idle_ops->sync(); - if (pm_wakeup_pending()) break; pm_wakeup_clear(false); + + s2idle_enter(); } pm_pr_dbg("resume from suspend-to-idle\n"); + +resume: + dpm_noirq_resume_devices(PMSG_RESUME); + dpm_noirq_end(); } void s2idle_wake(void) -- cgit v1.2.3 From b605c44c30b59990e806f930c37bd288b9d901a5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 23:52:18 +0200 Subject: PM: sleep: Drop dpm_noirq_begin() and dpm_noirq_end() Note that after previous changes dpm_noirq_begin() and dpm_noirq_end() each have only one caller, so move the code from them to their respective callers and drop them. Also note that dpm_noirq_resume_devices() and dpm_noirq_suspend_devices() need not be exported any more, so make them both static. This change is not expected to alter functionality. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- drivers/base/power/main.c | 30 ++++++++++++------------------ include/linux/pm.h | 4 ---- 2 files changed, 12 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index f08332fab531..134a8af51511 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -716,7 +716,7 @@ static void async_resume_noirq(void *data, async_cookie_t cookie) put_device(dev); } -void dpm_noirq_resume_devices(pm_message_t state) +static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); @@ -760,13 +760,6 @@ void dpm_noirq_resume_devices(pm_message_t state) trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } -void dpm_noirq_end(void) -{ - resume_device_irqs(); - device_wakeup_disarm_wake_irqs(); - cpuidle_resume(); -} - /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. @@ -777,7 +770,11 @@ void dpm_noirq_end(void) void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); - dpm_noirq_end(); + + resume_device_irqs(); + device_wakeup_disarm_wake_irqs(); + + cpuidle_resume(); } static pm_callback_t dpm_subsys_resume_early_cb(struct device *dev, @@ -1357,14 +1354,7 @@ static int device_suspend_noirq(struct device *dev) return __device_suspend_noirq(dev, pm_transition, false); } -void dpm_noirq_begin(void) -{ - cpuidle_pause(); - device_wakeup_arm_wake_irqs(); - suspend_device_irqs(); -} - -int dpm_noirq_suspend_devices(pm_message_t state) +static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; @@ -1421,7 +1411,11 @@ int dpm_suspend_noirq(pm_message_t state) { int ret; - dpm_noirq_begin(); + cpuidle_pause(); + + device_wakeup_arm_wake_irqs(); + suspend_device_irqs(); + ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); diff --git a/include/linux/pm.h b/include/linux/pm.h index 3619a870eaa4..4c441be03079 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -712,8 +712,6 @@ struct dev_pm_domain { extern void device_pm_lock(void); extern void dpm_resume_start(pm_message_t state); extern void dpm_resume_end(pm_message_t state); -extern void dpm_noirq_resume_devices(pm_message_t state); -extern void dpm_noirq_end(void); extern void dpm_resume_noirq(pm_message_t state); extern void dpm_resume_early(pm_message_t state); extern void dpm_resume(pm_message_t state); @@ -722,8 +720,6 @@ extern void dpm_complete(pm_message_t state); extern void device_pm_unlock(void); extern int dpm_suspend_end(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); -extern void dpm_noirq_begin(void); -extern int dpm_noirq_suspend_devices(pm_message_t state); extern int dpm_suspend_noirq(pm_message_t state); extern int dpm_suspend_late(pm_message_t state); extern int dpm_suspend(pm_message_t state); -- cgit v1.2.3 From 201c1db90cd643282185a00770f12f95da330eca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 23 Jul 2019 09:51:00 +0200 Subject: iommu/iova: Fix compilation error with !CONFIG_IOMMU_IOVA The stub function for !CONFIG_IOMMU_IOVA needs to be 'static inline'. Fixes: effa467870c76 ('iommu/vt-d: Don't queue_iova() if there is no flush queue') Signed-off-by: Joerg Roedel --- include/linux/iova.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iova.h b/include/linux/iova.h index cd0f1de901a8..a0637abffee8 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -236,7 +236,7 @@ static inline void init_iova_domain(struct iova_domain *iovad, { } -bool has_iova_flush_queue(struct iova_domain *iovad) +static inline bool has_iova_flush_queue(struct iova_domain *iovad) { return false; } -- cgit v1.2.3 From bca031e2c8aa22a978a2452bf959e27e9fa73dc7 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 18 Jul 2019 08:15:10 +0000 Subject: KVM: arm/arm64: Introduce kvm_pmu_vcpu_init() to setup PMU counter index We use "pmc->idx" and the "chained" bitmap to determine if the pmc is chained, in kvm_pmu_pmc_is_chained(). But idx might be uninitialized (and random) when we doing this decision, through a KVM_ARM_VCPU_INIT ioctl -> kvm_pmu_vcpu_reset(). And the test_bit() against this random idx will potentially hit a KASAN BUG [1]. In general, idx is the static property of a PMU counter that is not expected to be modified across resets, as suggested by Julien. It looks more reasonable if we can setup the PMU counter idx for a vcpu in its creation time. Introduce a new function - kvm_pmu_vcpu_init() for this basic setup. Oh, and the KASAN BUG will get fixed this way. [1] https://www.spinics.net/lists/kvm-arm/msg36700.html Fixes: 80f393a23be6 ("KVM: arm/arm64: Support chained PMU counters") Suggested-by: Andrew Murray Suggested-by: Julien Thierry Acked-by: Julien Thierry Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- include/kvm/arm_pmu.h | 2 ++ virt/kvm/arm/arm.c | 2 ++ virt/kvm/arm/pmu.c | 18 +++++++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 16c769a7f979..6db030439e29 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -34,6 +34,7 @@ struct kvm_pmu { u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu); +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val); @@ -71,6 +72,7 @@ static inline u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) { return 0; } +static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f645c0fbf7ec..c704fa696184 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -340,6 +340,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) /* Set up the timer */ kvm_timer_vcpu_init(vcpu); + kvm_pmu_vcpu_init(vcpu); + kvm_arm_reset_debug_ptr(vcpu); return kvm_vgic_vcpu_init(vcpu); diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 3dd8238ed246..362a01886bab 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -214,6 +214,20 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) kvm_pmu_release_perf_event(pmc); } +/** + * kvm_pmu_vcpu_init - assign pmu counter idx for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + pmu->pmc[i].idx = i; +} + /** * kvm_pmu_vcpu_reset - reset pmu state for cpu * @vcpu: The vcpu pointer @@ -224,10 +238,8 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); - pmu->pmc[i].idx = i; - } bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); } -- cgit v1.2.3 From 4475f8c4ab7b248991a60d9c02808dbb813d6be8 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 22 Jul 2019 10:24:33 +0100 Subject: ALSA: compress: Fix regression on compressed capture streams A previous fix to the stop handling on compressed capture streams causes some knock on issues. The previous fix updated snd_compr_drain_notify to set the state back to PREPARED for capture streams. This causes some issues however as the handling for snd_compr_poll differs between the two states and some user-space applications were relying on the poll failing after the stream had been stopped. To correct this regression whilst still fixing the original problem the patch was addressing, update the capture handling to skip the PREPARED state rather than skipping the SETUP state as it has done until now. Fixes: 4f2ab5e1d13d ("ALSA: compress: Fix stop handling on compressed capture streams") Signed-off-by: Charles Keepax Acked-by: Vinod Koul Signed-off-by: Takashi Iwai --- include/sound/compress_driver.h | 5 +---- sound/core/compress_offload.c | 16 +++++++++++----- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/sound/compress_driver.h b/include/sound/compress_driver.h index c5188ff724d1..bc88d6f964da 100644 --- a/include/sound/compress_driver.h +++ b/include/sound/compress_driver.h @@ -173,10 +173,7 @@ static inline void snd_compr_drain_notify(struct snd_compr_stream *stream) if (snd_BUG_ON(!stream)) return; - if (stream->direction == SND_COMPRESS_PLAYBACK) - stream->runtime->state = SNDRV_PCM_STATE_SETUP; - else - stream->runtime->state = SNDRV_PCM_STATE_PREPARED; + stream->runtime->state = SNDRV_PCM_STATE_SETUP; wake_up(&stream->runtime->sleep); } diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 99b882158705..d79aee6b9edd 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -574,10 +574,7 @@ snd_compr_set_params(struct snd_compr_stream *stream, unsigned long arg) stream->metadata_set = false; stream->next_track = false; - if (stream->direction == SND_COMPRESS_PLAYBACK) - stream->runtime->state = SNDRV_PCM_STATE_SETUP; - else - stream->runtime->state = SNDRV_PCM_STATE_PREPARED; + stream->runtime->state = SNDRV_PCM_STATE_SETUP; } else { return -EPERM; } @@ -693,8 +690,17 @@ static int snd_compr_start(struct snd_compr_stream *stream) { int retval; - if (stream->runtime->state != SNDRV_PCM_STATE_PREPARED) + switch (stream->runtime->state) { + case SNDRV_PCM_STATE_SETUP: + if (stream->direction != SND_COMPRESS_CAPTURE) + return -EPERM; + break; + case SNDRV_PCM_STATE_PREPARED: + break; + default: return -EPERM; + } + retval = stream->ops->trigger(stream, SNDRV_PCM_TRIGGER_START); if (!retval) stream->runtime->state = SNDRV_PCM_STATE_RUNNING; -- cgit v1.2.3 From 6298b78742be6593d372ed1b5bfa5397e1393595 Mon Sep 17 00:00:00 2001 From: Janusz Jankowski Date: Mon, 22 Jul 2019 09:14:02 -0500 Subject: ASoC: SOF: Intel: ssp: BCLK delay parameter Some codecs require BCLK to be on for some time, before sending any data. SOF can enable BCLK and then wait for guaranteed time, before starting DMA on SSP start. Signed-off-by: Janusz Jankowski Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20190722141402.7194-22-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/sound/sof/dai-intel.h | 3 +++ include/uapi/sound/sof/abi.h | 2 +- include/uapi/sound/sof/tokens.h | 1 + sound/soc/sof/topology.c | 3 +++ 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/sof/dai-intel.h b/include/sound/sof/dai-intel.h index 4bb8ee138ba7..a81afd3fbd41 100644 --- a/include/sound/sof/dai-intel.h +++ b/include/sound/sof/dai-intel.h @@ -76,6 +76,9 @@ struct sof_ipc_dai_ssp_params { uint16_t tdm_per_slot_padding_flag; uint32_t clks_control; uint32_t quirks; + uint32_t bclk_delay; /* guaranteed time (ms) for which BCLK + * will be driven, before sending data + */ } __packed; /* HDA Configuration Request - SOF_IPC_DAI_HDA_CONFIG */ diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h index 4a9c24434f42..dff70a42445a 100644 --- a/include/uapi/sound/sof/abi.h +++ b/include/uapi/sound/sof/abi.h @@ -26,7 +26,7 @@ /* SOF ABI version major, minor and patch numbers */ #define SOF_ABI_MAJOR 3 -#define SOF_ABI_MINOR 8 +#define SOF_ABI_MINOR 9 #define SOF_ABI_PATCH 0 /* SOF ABI version number. Format within 32bit word is MMmmmppp */ diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h index dc1b27daaac6..6435240cef13 100644 --- a/include/uapi/sound/sof/tokens.h +++ b/include/uapi/sound/sof/tokens.h @@ -75,6 +75,7 @@ #define SOF_TKN_INTEL_SSP_FRAME_PULSE_WIDTH 503 #define SOF_TKN_INTEL_SSP_QUIRKS 504 #define SOF_TKN_INTEL_SSP_TDM_PADDING_PER_SLOT 505 +#define SOF_TKN_INTEL_SSP_BCLK_DELAY 506 /* DMIC */ #define SOF_TKN_INTEL_DMIC_DRIVER_VERSION 600 diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c index 432ae343f960..12b7d900b9c2 100644 --- a/sound/soc/sof/topology.c +++ b/sound/soc/sof/topology.c @@ -748,6 +748,9 @@ static const struct sof_topology_token ssp_tokens[] = { get_token_u16, offsetof(struct sof_ipc_dai_ssp_params, tdm_per_slot_padding_flag), 0}, + {SOF_TKN_INTEL_SSP_BCLK_DELAY, SND_SOC_TPLG_TUPLE_TYPE_WORD, + get_token_u32, + offsetof(struct sof_ipc_dai_ssp_params, bclk_delay), 0}, }; -- cgit v1.2.3 From 967b109096b2c2b7ef45a3fce44908efe05779a9 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 23 Jul 2019 10:07:40 +1000 Subject: media: uapi: new file needs types.h Today's linux-next build (x86_64 allmodconfig) failed like this: include/media/vp8-ctrls.h:25:2: error: unknown type name '__s8' __s8 quant_update[4]; ^~~~ ... include/media/vp8-ctrls.h:107:2: error: unknown type name '__u64' __u64 flags; ^~~~~ Caused by commit a57d6acaf352 ("media: uapi: Add VP8 stateless decoder API") Fixes: a57d6acaf352 ("media: uapi: Add VP8 stateless decoder API") Signed-off-by: Stephen Rothwell Signed-off-by: Mauro Carvalho Chehab --- include/media/vp8-ctrls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/media/vp8-ctrls.h b/include/media/vp8-ctrls.h index 6cc2eeea4c90..53cba826e482 100644 --- a/include/media/vp8-ctrls.h +++ b/include/media/vp8-ctrls.h @@ -11,6 +11,8 @@ #ifndef _VP8_CTRLS_H_ #define _VP8_CTRLS_H_ +#include + #define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F') #define V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER (V4L2_CID_MPEG_BASE + 2000) -- cgit v1.2.3 From 327fe1d42b83f8a06b33ba30159582b49af5fc8e Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Tue, 23 Jul 2019 00:27:41 -0300 Subject: block: blk-mq: Remove blk_mq_sched_started_request and started_request blk_mq_sched_completed_request is a function that checks if the elevator related to the request has started_request implemented, but currently, none of the available IO schedulers implement started_request, so remove both. Signed-off-by: Marcos Paulo de Souza Signed-off-by: Jens Axboe --- block/blk-mq-sched.h | 9 --------- block/blk-mq.c | 2 -- include/linux/elevator.h | 1 - 3 files changed, 12 deletions(-) (limited to 'include') diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index cf22ab00fefb..126021fc3a11 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -61,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) e->type->ops.completed_request(rq, now); } -static inline void blk_mq_sched_started_request(struct request *rq) -{ - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.started_request) - e->type->ops.started_request(rq); -} - static inline void blk_mq_sched_requeue_request(struct request *rq) { struct request_queue *q = rq->q; diff --git a/block/blk-mq.c b/block/blk-mq.c index 2bc2c0705660..f78d3287dd82 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -669,8 +669,6 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - blk_mq_sched_started_request(rq); - trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 17cd0078377c..1dd014c9c87b 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -45,7 +45,6 @@ struct elevator_mq_ops { struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct request *, u64); - void (*started_request)(struct request *); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); -- cgit v1.2.3 From 06532750010e06dd4b6d69983773677df7fc5291 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 22 Jul 2019 18:51:49 +0200 Subject: dma-mapping: use dma_get_mask in dma_addressing_limited We currently have cases where the dma_addressing_limited() gets called with dma_mask unset. This causes a NULL pointer dereference. Use dma_get_mask() accessor to prevent the crash. Fixes: b866455423e0 ("dma-mapping: add a dma_addressing_limited helper") Signed-off-by: Eric Auger Acked-by: Michael S. Tsirkin Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index e11b115dd0e4..f7d1eea32c78 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -689,8 +689,8 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) */ static inline bool dma_addressing_limited(struct device *dev) { - return min_not_zero(*dev->dma_mask, dev->bus_dma_mask) < - dma_get_required_mask(dev); + return min_not_zero(dma_get_mask(dev), dev->bus_dma_mask) < + dma_get_required_mask(dev); } #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS -- cgit v1.2.3 From aa6166c2ac28392d64f2d8b3acfb56c8fe657147 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:33:04 +0900 Subject: ASoC: soc-dai: mv soc_dai_hw_params() to soc-dai Sometimes ALSA SoC naming is very random. Current soc_dai_hw_params() should use snd_soc_dai_xxx() style. And then, 1st parameter should be dai. Otherwise it is confusable. - soc_dai_hw_params(..., dai); + snd_soc_dai_hw_params(dai, ...); Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87zhl6hn5b.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 4 ++++ include/sound/soc.h | 4 ---- sound/soc/soc-dai.c | 30 ++++++++++++++++++++++++++++++ sound/soc/soc-dapm.c | 4 ++-- sound/soc/soc-pcm.c | 35 +++-------------------------------- 5 files changed, 39 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index f5d70041108f..3773262a1b77 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -145,6 +145,10 @@ int snd_soc_dai_get_channel_map(struct snd_soc_dai *dai, int snd_soc_dai_is_dummy(struct snd_soc_dai *dai); +int snd_soc_dai_hw_params(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params); + struct snd_soc_dai_ops { /* * DAI clocking configuration, all optional. diff --git a/include/sound/soc.h b/include/sound/soc.h index 4e8071269639..d770606732cd 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -505,10 +505,6 @@ int snd_soc_params_to_bclk(struct snd_pcm_hw_params *parms); int snd_soc_set_runtime_hwparams(struct snd_pcm_substream *substream, const struct snd_pcm_hardware *hw); -int soc_dai_hw_params(struct snd_pcm_substream *substream, - struct snd_pcm_hw_params *params, - struct snd_soc_dai *dai); - /* Jack reporting */ int snd_soc_card_jack_new(struct snd_soc_card *card, const char *id, int type, struct snd_soc_jack *jack, struct snd_soc_jack_pin *pins, diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index a1009ead40de..f883d27d136f 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -252,3 +252,33 @@ int snd_soc_dai_digital_mute(struct snd_soc_dai *dai, int mute, return -ENOTSUPP; } EXPORT_SYMBOL_GPL(snd_soc_dai_digital_mute); + +int snd_soc_dai_hw_params(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params) +{ + struct snd_soc_pcm_runtime *rtd = substream->private_data; + int ret; + + /* perform any topology hw_params fixups before DAI */ + if (rtd->dai_link->be_hw_params_fixup) { + ret = rtd->dai_link->be_hw_params_fixup(rtd, params); + if (ret < 0) { + dev_err(rtd->dev, + "ASoC: hw_params topology fixup failed %d\n", + ret); + return ret; + } + } + + if (dai->driver->ops->hw_params) { + ret = dai->driver->ops->hw_params(substream, params, dai); + if (ret < 0) { + dev_err(dai->dev, "ASoC: can't set %s hw params: %d\n", + dai->name, ret); + return ret; + } + } + + return 0; +} diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index f013b24c050a..8fc6a01f5d8b 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3839,7 +3839,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, } } source->active++; - ret = soc_dai_hw_params(&substream, params, source); + ret = snd_soc_dai_hw_params(source, &substream, params); if (ret < 0) goto out; @@ -3861,7 +3861,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, } } sink->active++; - ret = soc_dai_hw_params(&substream, params, sink); + ret = snd_soc_dai_hw_params(sink, &substream, params); if (ret < 0) goto out; diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 4878d22ebd8c..420cc94e0a46 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -877,36 +877,6 @@ static void soc_pcm_codec_params_fixup(struct snd_pcm_hw_params *params, interval->max = channels; } -int soc_dai_hw_params(struct snd_pcm_substream *substream, - struct snd_pcm_hw_params *params, - struct snd_soc_dai *dai) -{ - struct snd_soc_pcm_runtime *rtd = substream->private_data; - int ret; - - /* perform any topology hw_params fixups before DAI */ - if (rtd->dai_link->be_hw_params_fixup) { - ret = rtd->dai_link->be_hw_params_fixup(rtd, params); - if (ret < 0) { - dev_err(rtd->dev, - "ASoC: hw_params topology fixup failed %d\n", - ret); - return ret; - } - } - - if (dai->driver->ops->hw_params) { - ret = dai->driver->ops->hw_params(substream, params, dai); - if (ret < 0) { - dev_err(dai->dev, "ASoC: can't set %s hw params: %d\n", - dai->name, ret); - return ret; - } - } - - return 0; -} - static int soc_pcm_components_hw_free(struct snd_pcm_substream *substream, struct snd_soc_component *last) { @@ -989,7 +959,8 @@ static int soc_pcm_hw_params(struct snd_pcm_substream *substream, soc_pcm_codec_params_fixup(&codec_params, codec_dai->rx_mask); - ret = soc_dai_hw_params(substream, &codec_params, codec_dai); + ret = snd_soc_dai_hw_params(codec_dai, substream, + &codec_params); if(ret < 0) goto codec_err; @@ -1001,7 +972,7 @@ static int soc_pcm_hw_params(struct snd_pcm_substream *substream, snd_soc_dapm_update_dai(substream, &codec_params, codec_dai); } - ret = soc_dai_hw_params(substream, params, cpu_dai); + ret = snd_soc_dai_hw_params(cpu_dai, substream, params); if (ret < 0) goto interface_err; -- cgit v1.2.3 From 846faaed9df7899e74311db3aec0a41a2f6bc345 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:33:19 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_hw_free() Current ALSA SoC is directly using dai->driver->ops->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_hw_free() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87y30qhn4w.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 ++ sound/soc/soc-dai.c | 7 +++++++ sound/soc/soc-dapm.c | 7 ++----- sound/soc/soc-pcm.c | 12 ++++-------- 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 3773262a1b77..5222b6a758f2 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -148,6 +148,8 @@ int snd_soc_dai_is_dummy(struct snd_soc_dai *dai); int snd_soc_dai_hw_params(struct snd_soc_dai *dai, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params); +void snd_soc_dai_hw_free(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index f883d27d136f..39a685e6acd5 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -282,3 +282,10 @@ int snd_soc_dai_hw_params(struct snd_soc_dai *dai, return 0; } + +void snd_soc_dai_hw_free(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream) +{ + if (dai->driver->ops->hw_free) + dai->driver->ops->hw_free(substream, dai); +} diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 8fc6a01f5d8b..0783b05133ad 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3898,9 +3898,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, snd_soc_dapm_widget_for_each_source_path(w, path) { source = path->source->priv; - if (source->driver->ops->hw_free) - source->driver->ops->hw_free(&substream, - source); + snd_soc_dai_hw_free(source, &substream); source->active--; if (source->driver->ops->shutdown) @@ -3912,8 +3910,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, snd_soc_dapm_widget_for_each_sink_path(w, path) { sink = path->sink->priv; - if (sink->driver->ops->hw_free) - sink->driver->ops->hw_free(&substream, sink); + snd_soc_dai_hw_free(sink, &substream); sink->active--; if (sink->driver->ops->shutdown) diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 420cc94e0a46..58fc4e98ab59 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -1011,8 +1011,7 @@ out: component_err: soc_pcm_components_hw_free(substream, component); - if (cpu_dai->driver->ops->hw_free) - cpu_dai->driver->ops->hw_free(substream, cpu_dai); + snd_soc_dai_hw_free(cpu_dai, substream); cpu_dai->rate = 0; interface_err: @@ -1023,8 +1022,7 @@ codec_err: if (!snd_soc_dai_stream_valid(codec_dai, substream->stream)) continue; - if (codec_dai->driver->ops->hw_free) - codec_dai->driver->ops->hw_free(substream, codec_dai); + snd_soc_dai_hw_free(codec_dai, substream); codec_dai->rate = 0; } @@ -1083,12 +1081,10 @@ static int soc_pcm_hw_free(struct snd_pcm_substream *substream) if (!snd_soc_dai_stream_valid(codec_dai, substream->stream)) continue; - if (codec_dai->driver->ops->hw_free) - codec_dai->driver->ops->hw_free(substream, codec_dai); + snd_soc_dai_hw_free(codec_dai, substream); } - if (cpu_dai->driver->ops->hw_free) - cpu_dai->driver->ops->hw_free(substream, cpu_dai); + snd_soc_dai_hw_free(cpu_dai, substream); mutex_unlock(&rtd->pcm_mutex); return 0; -- cgit v1.2.3 From 5a52a04531486e2ab069b7882432c8b266db36e6 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:33:32 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_startup() Current ALSA SoC is directly using dai->driver->ops->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_startup() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87wogahn4i.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 ++ sound/soc/soc-dai.c | 11 +++++++++++ sound/soc/soc-dapm.c | 28 ++++++++++------------------ sound/soc/soc-pcm.c | 27 +++++++++++---------------- 4 files changed, 34 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 5222b6a758f2..0d16c5bb20bb 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -150,6 +150,8 @@ int snd_soc_dai_hw_params(struct snd_soc_dai *dai, struct snd_pcm_hw_params *params); void snd_soc_dai_hw_free(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); +int snd_soc_dai_startup(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 39a685e6acd5..6e196636e42f 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -289,3 +289,14 @@ void snd_soc_dai_hw_free(struct snd_soc_dai *dai, if (dai->driver->ops->hw_free) dai->driver->ops->hw_free(substream, dai); } + +int snd_soc_dai_startup(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream) +{ + int ret = 0; + + if (dai->driver->ops->startup) + ret = dai->driver->ops->startup(substream, dai); + + return ret; +} diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 0783b05133ad..71bfd049480a 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3828,15 +3828,11 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, snd_soc_dapm_widget_for_each_source_path(w, path) { source = path->source->priv; - if (source->driver->ops->startup) { - ret = source->driver->ops->startup(&substream, - source); - if (ret < 0) { - dev_err(source->dev, - "ASoC: startup() failed: %d\n", - ret); - goto out; - } + ret = snd_soc_dai_startup(source, &substream); + if (ret < 0) { + dev_err(source->dev, + "ASoC: startup() failed: %d\n", ret); + goto out; } source->active++; ret = snd_soc_dai_hw_params(source, &substream, params); @@ -3850,15 +3846,11 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, snd_soc_dapm_widget_for_each_sink_path(w, path) { sink = path->sink->priv; - if (sink->driver->ops->startup) { - ret = sink->driver->ops->startup(&substream, - sink); - if (ret < 0) { - dev_err(sink->dev, - "ASoC: startup() failed: %d\n", - ret); - goto out; - } + ret = snd_soc_dai_startup(sink, &substream); + if (ret < 0) { + dev_err(sink->dev, + "ASoC: startup() failed: %d\n", ret); + goto out; } sink->active++; ret = snd_soc_dai_hw_params(sink, &substream, params); diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 58fc4e98ab59..9c8713a3eef1 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -535,13 +535,11 @@ static int soc_pcm_open(struct snd_pcm_substream *substream) mutex_lock_nested(&rtd->pcm_mutex, rtd->pcm_subclass); /* startup the audio subsystem */ - if (cpu_dai->driver->ops->startup) { - ret = cpu_dai->driver->ops->startup(substream, cpu_dai); - if (ret < 0) { - dev_err(cpu_dai->dev, "ASoC: can't open interface" - " %s: %d\n", cpu_dai->name, ret); - goto out; - } + ret = snd_soc_dai_startup(cpu_dai, substream); + if (ret < 0) { + dev_err(cpu_dai->dev, "ASoC: can't open interface %s: %d\n", + cpu_dai->name, ret); + goto out; } ret = soc_pcm_components_open(substream, &component); @@ -549,15 +547,12 @@ static int soc_pcm_open(struct snd_pcm_substream *substream) goto component_err; for_each_rtd_codec_dai(rtd, i, codec_dai) { - if (codec_dai->driver->ops->startup) { - ret = codec_dai->driver->ops->startup(substream, - codec_dai); - if (ret < 0) { - dev_err(codec_dai->dev, - "ASoC: can't open codec %s: %d\n", - codec_dai->name, ret); - goto codec_dai_err; - } + ret = snd_soc_dai_startup(codec_dai, substream); + if (ret < 0) { + dev_err(codec_dai->dev, + "ASoC: can't open codec %s: %d\n", + codec_dai->name, ret); + goto codec_dai_err; } if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) -- cgit v1.2.3 From 330fcb5135e0588b1ea3b0bbab587d1317c1cf7b Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:33:39 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_shutdown() Current ALSA SoC is directly using dai->driver->ops->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_shutdown() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87v9vuhn4b.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 ++ sound/soc/soc-dai.c | 7 +++++++ sound/soc/soc-dapm.c | 7 ++----- sound/soc/soc-pcm.c | 18 ++++++------------ 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 0d16c5bb20bb..32545d457b3d 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -152,6 +152,8 @@ void snd_soc_dai_hw_free(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); int snd_soc_dai_startup(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); +void snd_soc_dai_shutdown(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 6e196636e42f..67ff6cc1fe02 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -300,3 +300,10 @@ int snd_soc_dai_startup(struct snd_soc_dai *dai, return ret; } + +void snd_soc_dai_shutdown(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream) +{ + if (dai->driver->ops->shutdown) + dai->driver->ops->shutdown(substream, dai); +} diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 71bfd049480a..1d04612601ad 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3893,9 +3893,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, snd_soc_dai_hw_free(source, &substream); source->active--; - if (source->driver->ops->shutdown) - source->driver->ops->shutdown(&substream, - source); + snd_soc_dai_shutdown(source, &substream); } substream.stream = SNDRV_PCM_STREAM_PLAYBACK; @@ -3905,8 +3903,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, snd_soc_dai_hw_free(sink, &substream); sink->active--; - if (sink->driver->ops->shutdown) - sink->driver->ops->shutdown(&substream, sink); + snd_soc_dai_shutdown(sink, &substream); } break; diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 9c8713a3eef1..ed5ae23c7104 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -641,16 +641,13 @@ machine_err: i = rtd->num_codecs; codec_dai_err: - for_each_rtd_codec_dai_rollback(rtd, i, codec_dai) { - if (codec_dai->driver->ops->shutdown) - codec_dai->driver->ops->shutdown(substream, codec_dai); - } + for_each_rtd_codec_dai_rollback(rtd, i, codec_dai) + snd_soc_dai_shutdown(codec_dai, substream); component_err: soc_pcm_components_close(substream, component); - if (cpu_dai->driver->ops->shutdown) - cpu_dai->driver->ops->shutdown(substream, cpu_dai); + snd_soc_dai_shutdown(cpu_dai, substream); out: mutex_unlock(&rtd->pcm_mutex); @@ -728,13 +725,10 @@ static int soc_pcm_close(struct snd_pcm_substream *substream) snd_soc_dai_digital_mute(cpu_dai, 1, substream->stream); - if (cpu_dai->driver->ops->shutdown) - cpu_dai->driver->ops->shutdown(substream, cpu_dai); + snd_soc_dai_shutdown(cpu_dai, substream); - for_each_rtd_codec_dai(rtd, i, codec_dai) { - if (codec_dai->driver->ops->shutdown) - codec_dai->driver->ops->shutdown(substream, codec_dai); - } + for_each_rtd_codec_dai(rtd, i, codec_dai) + snd_soc_dai_shutdown(codec_dai, substream); if (rtd->dai_link->ops->shutdown) rtd->dai_link->ops->shutdown(substream); -- cgit v1.2.3 From 4beb8e109d30d339d44308a767dd6f5614492f3e Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:33:45 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_prepare() Current ALSA SoC is directly using dai->driver->ops->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_prepare() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87tvbehn46.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 ++ sound/soc/soc-dai.c | 11 +++++++++++ sound/soc/soc-pcm.c | 27 +++++++++++---------------- 3 files changed, 24 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 32545d457b3d..c7dff6a0b5b9 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -154,6 +154,8 @@ int snd_soc_dai_startup(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); void snd_soc_dai_shutdown(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); +int snd_soc_dai_prepare(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 67ff6cc1fe02..cb810888c563 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -307,3 +307,14 @@ void snd_soc_dai_shutdown(struct snd_soc_dai *dai, if (dai->driver->ops->shutdown) dai->driver->ops->shutdown(substream, dai); } + +int snd_soc_dai_prepare(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream) +{ + int ret = 0; + + if (dai->driver->ops->prepare) + ret = dai->driver->ops->prepare(substream, dai); + + return ret; +} diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index ed5ae23c7104..d7611af90dce 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -814,27 +814,22 @@ static int soc_pcm_prepare(struct snd_pcm_substream *substream) } for_each_rtd_codec_dai(rtd, i, codec_dai) { - if (codec_dai->driver->ops->prepare) { - ret = codec_dai->driver->ops->prepare(substream, - codec_dai); - if (ret < 0) { - dev_err(codec_dai->dev, - "ASoC: codec DAI prepare error: %d\n", - ret); - goto out; - } - } - } - - if (cpu_dai->driver->ops->prepare) { - ret = cpu_dai->driver->ops->prepare(substream, cpu_dai); + ret = snd_soc_dai_prepare(codec_dai, substream); if (ret < 0) { - dev_err(cpu_dai->dev, - "ASoC: cpu DAI prepare error: %d\n", ret); + dev_err(codec_dai->dev, + "ASoC: codec DAI prepare error: %d\n", + ret); goto out; } } + ret = snd_soc_dai_prepare(cpu_dai, substream); + if (ret < 0) { + dev_err(cpu_dai->dev, + "ASoC: cpu DAI prepare error: %d\n", ret); + goto out; + } + /* cancel any delayed stream shutdown that is pending */ if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && rtd->pop_wait) { -- cgit v1.2.3 From 95aef35533844f35544851b0cdc1fc154b603307 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:33:51 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_trigger() Current ALSA SoC is directly using dai->driver->ops->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_trigger() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87sgqyhn40.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 ++ sound/soc/soc-dai.c | 12 ++++++++++++ sound/soc/soc-pcm.c | 17 ++++++----------- 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index c7dff6a0b5b9..72b8e76f1cc4 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -156,6 +156,8 @@ void snd_soc_dai_shutdown(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); int snd_soc_dai_prepare(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); +int snd_soc_dai_trigger(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream, int cmd); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index cb810888c563..18c447e169f6 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -318,3 +318,15 @@ int snd_soc_dai_prepare(struct snd_soc_dai *dai, return ret; } + +int snd_soc_dai_trigger(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream, + int cmd) +{ + int ret = 0; + + if (dai->driver->ops->trigger) + ret = dai->driver->ops->trigger(substream, cmd, dai); + + return ret; +} diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index d7611af90dce..a628b08f966e 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -1084,12 +1084,9 @@ static int soc_pcm_trigger(struct snd_pcm_substream *substream, int cmd) int i, ret; for_each_rtd_codec_dai(rtd, i, codec_dai) { - if (codec_dai->driver->ops->trigger) { - ret = codec_dai->driver->ops->trigger(substream, - cmd, codec_dai); - if (ret < 0) - return ret; - } + ret = snd_soc_dai_trigger(codec_dai, substream, cmd); + if (ret < 0) + return ret; } for_each_rtdcom(rtd, rtdcom) { @@ -1104,11 +1101,9 @@ static int soc_pcm_trigger(struct snd_pcm_substream *substream, int cmd) return ret; } - if (cpu_dai->driver->ops->trigger) { - ret = cpu_dai->driver->ops->trigger(substream, cmd, cpu_dai); - if (ret < 0) - return ret; - } + snd_soc_dai_trigger(cpu_dai, substream, cmd); + if (ret < 0) + return ret; if (rtd->dai_link->ops->trigger) { ret = rtd->dai_link->ops->trigger(substream, cmd); -- cgit v1.2.3 From 5c0769af4caf8fbdad2e9c0051ab0081b8e22b0a Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:33:56 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_bespoke_trigger() Current ALSA SoC is directly using dai->driver->ops->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_bespoke_trigger() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87r26ihn3u.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 ++ sound/soc/soc-dai.c | 12 ++++++++++++ sound/soc/soc-pcm.c | 16 ++++++---------- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 72b8e76f1cc4..6a5566d459ad 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -158,6 +158,8 @@ int snd_soc_dai_prepare(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); int snd_soc_dai_trigger(struct snd_soc_dai *dai, struct snd_pcm_substream *substream, int cmd); +int snd_soc_dai_bespoke_trigger(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream, int cmd); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 18c447e169f6..6f466cfcbeef 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -330,3 +330,15 @@ int snd_soc_dai_trigger(struct snd_soc_dai *dai, return ret; } + +int snd_soc_dai_bespoke_trigger(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream, + int cmd) +{ + int ret = 0; + + if (dai->driver->ops->bespoke_trigger) + ret = dai->driver->ops->bespoke_trigger(substream, cmd, dai); + + return ret; +} diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index a628b08f966e..a10627f1ceff 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -1123,19 +1123,15 @@ static int soc_pcm_bespoke_trigger(struct snd_pcm_substream *substream, int i, ret; for_each_rtd_codec_dai(rtd, i, codec_dai) { - if (codec_dai->driver->ops->bespoke_trigger) { - ret = codec_dai->driver->ops->bespoke_trigger(substream, - cmd, codec_dai); - if (ret < 0) - return ret; - } - } - - if (cpu_dai->driver->ops->bespoke_trigger) { - ret = cpu_dai->driver->ops->bespoke_trigger(substream, cmd, cpu_dai); + ret = snd_soc_dai_bespoke_trigger(codec_dai, substream, cmd); if (ret < 0) return ret; } + + snd_soc_dai_bespoke_trigger(cpu_dai, substream, cmd); + if (ret < 0) + return ret; + return 0; } /* -- cgit v1.2.3 From 1dea80d4b2bd3b53c58f008ca2bcd73182583711 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:34:09 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_delay() Current ALSA SoC is directly using dai->driver->ops->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_delay() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87o91mhn3i.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 ++ sound/soc/soc-dai.c | 11 +++++++++++ sound/soc/soc-pcm.c | 9 +++------ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 6a5566d459ad..7cfed3034511 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -160,6 +160,8 @@ int snd_soc_dai_trigger(struct snd_soc_dai *dai, struct snd_pcm_substream *substream, int cmd); int snd_soc_dai_bespoke_trigger(struct snd_soc_dai *dai, struct snd_pcm_substream *substream, int cmd); +snd_pcm_sframes_t snd_soc_dai_delay(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 6f466cfcbeef..5b5b979cd1f3 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -342,3 +342,14 @@ int snd_soc_dai_bespoke_trigger(struct snd_soc_dai *dai, return ret; } + +snd_pcm_sframes_t snd_soc_dai_delay(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream) +{ + int delay = 0; + + if (dai->driver->ops->delay) + delay = dai->driver->ops->delay(substream, dai); + + return delay; +} diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index a10627f1ceff..f3137723301c 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -1169,14 +1169,11 @@ static snd_pcm_uframes_t soc_pcm_pointer(struct snd_pcm_substream *substream) /* base delay if assigned in pointer callback */ delay = runtime->delay; - if (cpu_dai->driver->ops->delay) - delay += cpu_dai->driver->ops->delay(substream, cpu_dai); + delay += snd_soc_dai_delay(cpu_dai, substream); for_each_rtd_codec_dai(rtd, i, codec_dai) { - if (codec_dai->driver->ops->delay) - codec_delay = max(codec_delay, - codec_dai->driver->ops->delay(substream, - codec_dai)); + codec_delay = max(codec_delay, + snd_soc_dai_delay(codec_dai, substream)); } delay += codec_delay; -- cgit v1.2.3 From e0f2262292d0c8160cfd9a8c40425107fb65ab29 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:34:29 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_suspend() Current ALSA SoC is directly using dai->driver->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_suspend() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87muh6hn2x.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 1 + sound/soc/soc-core.c | 8 ++++---- sound/soc/soc-dai.c | 6 ++++++ 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 7cfed3034511..6c5604a7dbc2 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -162,6 +162,7 @@ int snd_soc_dai_bespoke_trigger(struct snd_soc_dai *dai, struct snd_pcm_substream *substream, int cmd); snd_pcm_sframes_t snd_soc_dai_delay(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); +void snd_soc_dai_suspend(struct snd_soc_dai *dai); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 6e8c5c8eeaec..7493afb2371c 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -511,8 +511,8 @@ int snd_soc_suspend(struct device *dev) if (rtd->dai_link->ignore_suspend) continue; - if (cpu_dai->driver->suspend && !cpu_dai->driver->bus_control) - cpu_dai->driver->suspend(cpu_dai); + if (!cpu_dai->driver->bus_control) + snd_soc_dai_suspend(cpu_dai); } /* close any waiting streams */ @@ -584,8 +584,8 @@ int snd_soc_suspend(struct device *dev) if (rtd->dai_link->ignore_suspend) continue; - if (cpu_dai->driver->suspend && cpu_dai->driver->bus_control) - cpu_dai->driver->suspend(cpu_dai); + if (cpu_dai->driver->bus_control) + snd_soc_dai_suspend(cpu_dai); /* deactivate pins to sleep state */ pinctrl_pm_select_sleep_state(cpu_dai->dev); diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 5b5b979cd1f3..3373598e0682 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -353,3 +353,9 @@ snd_pcm_sframes_t snd_soc_dai_delay(struct snd_soc_dai *dai, return delay; } + +void snd_soc_dai_suspend(struct snd_soc_dai *dai) +{ + if (dai->driver->suspend) + dai->driver->suspend(dai); +} -- cgit v1.2.3 From 24b09d051164680f0a1d1910efe21ce36ad5c1ca Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:34:43 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_resume() Current ALSA SoC is directly using dai->driver->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_resume() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87lfwqhn2j.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 1 + sound/soc/soc-core.c | 8 ++++---- sound/soc/soc-dai.c | 6 ++++++ 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 6c5604a7dbc2..ed78e34a814e 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -163,6 +163,7 @@ int snd_soc_dai_bespoke_trigger(struct snd_soc_dai *dai, snd_pcm_sframes_t snd_soc_dai_delay(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); void snd_soc_dai_suspend(struct snd_soc_dai *dai); +void snd_soc_dai_resume(struct snd_soc_dai *dai); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 7493afb2371c..5c02f90cea69 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -631,8 +631,8 @@ static void soc_resume_deferred(struct work_struct *work) if (rtd->dai_link->ignore_suspend) continue; - if (cpu_dai->driver->resume && cpu_dai->driver->bus_control) - cpu_dai->driver->resume(cpu_dai); + if (cpu_dai->driver->bus_control) + snd_soc_dai_resume(cpu_dai); } for_each_card_components(card, component) { @@ -678,8 +678,8 @@ static void soc_resume_deferred(struct work_struct *work) if (rtd->dai_link->ignore_suspend) continue; - if (cpu_dai->driver->resume && !cpu_dai->driver->bus_control) - cpu_dai->driver->resume(cpu_dai); + if (!cpu_dai->driver->bus_control) + snd_soc_dai_resume(cpu_dai); } if (card->resume_post) diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 3373598e0682..ddb6f217c0ed 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -359,3 +359,9 @@ void snd_soc_dai_suspend(struct snd_soc_dai *dai) if (dai->driver->suspend) dai->driver->suspend(dai); } + +void snd_soc_dai_resume(struct snd_soc_dai *dai) +{ + if (dai->driver->resume) + dai->driver->resume(dai); +} -- cgit v1.2.3 From cfd9b5fbfe1e8763018aea2600aa0d6ff015ebfc Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:34:56 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_probe() Current ALSA SoC is directly using dai->driver->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_probe() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87k1cahn26.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 1 + sound/soc/soc-core.c | 15 +++++++-------- sound/soc/soc-dai.c | 7 +++++++ 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index ed78e34a814e..da8d8b889089 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -164,6 +164,7 @@ snd_pcm_sframes_t snd_soc_dai_delay(struct snd_soc_dai *dai, struct snd_pcm_substream *substream); void snd_soc_dai_suspend(struct snd_soc_dai *dai); void snd_soc_dai_resume(struct snd_soc_dai *dai); +int snd_soc_dai_probe(struct snd_soc_dai *dai); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 5c02f90cea69..3e73468225f9 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1434,18 +1434,17 @@ static int soc_probe_link_components(struct snd_soc_card *card, static int soc_probe_dai(struct snd_soc_dai *dai, int order) { + int ret; + if (dai->probed || dai->driver->probe_order != order) return 0; - if (dai->driver->probe) { - int ret = dai->driver->probe(dai); - - if (ret < 0) { - dev_err(dai->dev, "ASoC: failed to probe DAI %s: %d\n", - dai->name, ret); - return ret; - } + ret = snd_soc_dai_probe(dai); + if (ret < 0) { + dev_err(dai->dev, "ASoC: failed to probe DAI %s: %d\n", + dai->name, ret); + return ret; } dai->probed = 1; diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index ddb6f217c0ed..55c1fac99613 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -365,3 +365,10 @@ void snd_soc_dai_resume(struct snd_soc_dai *dai) if (dai->driver->resume) dai->driver->resume(dai); } + +int snd_soc_dai_probe(struct snd_soc_dai *dai) +{ + if (dai->driver->probe) + return dai->driver->probe(dai); + return 0; +} -- cgit v1.2.3 From dcdab5820edd6123911dbd767ee1e389008b6a83 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:35:05 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_remove() Current ALSA SoC is directly using dai->driver->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_remvoe() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87imruhn1x.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 1 + sound/soc/soc-core.c | 13 ++++++------- sound/soc/soc-dai.c | 7 +++++++ 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index da8d8b889089..2a11f177ce01 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -165,6 +165,7 @@ snd_pcm_sframes_t snd_soc_dai_delay(struct snd_soc_dai *dai, void snd_soc_dai_suspend(struct snd_soc_dai *dai); void snd_soc_dai_resume(struct snd_soc_dai *dai); int snd_soc_dai_probe(struct snd_soc_dai *dai); +int snd_soc_dai_remove(struct snd_soc_dai *dai); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 3e73468225f9..727fd342b3fb 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -992,13 +992,12 @@ static void soc_remove_dai(struct snd_soc_dai *dai, int order) dai->driver->remove_order != order) return; - if (dai->driver->remove) { - err = dai->driver->remove(dai); - if (err < 0) - dev_err(dai->dev, - "ASoC: failed to remove %s: %d\n", - dai->name, err); - } + err = snd_soc_dai_remove(dai); + if (err < 0) + dev_err(dai->dev, + "ASoC: failed to remove %s: %d\n", + dai->name, err); + dai->probed = 0; } diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 55c1fac99613..384765c747da 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -372,3 +372,10 @@ int snd_soc_dai_probe(struct snd_soc_dai *dai) return dai->driver->probe(dai); return 0; } + +int snd_soc_dai_remove(struct snd_soc_dai *dai) +{ + if (dai->driver->remove) + return dai->driver->remove(dai); + return 0; +} -- cgit v1.2.3 From b423c4202135f7794e0a9c55a884f5933d8e7156 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:35:29 +0900 Subject: ASoC: soc-dai: add snd_soc_dai_compress_new() Current ALSA SoC is directly using dai->driver->xxx, thus, it has deep nested bracket, and it makes code unreadable. This patch adds new snd_soc_dai_compress_new() and use it. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87h87ehn1a.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 ++ sound/soc/soc-core.c | 15 ++++++++------- sound/soc/soc-dai.c | 8 ++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 2a11f177ce01..0f8b09520020 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -166,6 +166,8 @@ void snd_soc_dai_suspend(struct snd_soc_dai *dai); void snd_soc_dai_resume(struct snd_soc_dai *dai); int snd_soc_dai_probe(struct snd_soc_dai *dai); int snd_soc_dai_remove(struct snd_soc_dai *dai); +int snd_soc_dai_compress_new(struct snd_soc_dai *dai, + struct snd_soc_pcm_runtime *rtd, int num); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 727fd342b3fb..458b090f026a 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1545,15 +1545,16 @@ static int soc_probe_link_dais(struct snd_soc_card *card, num = rtd->dai_link->id; } - if (cpu_dai->driver->compress_new) { - /* create compress_device" */ - ret = cpu_dai->driver->compress_new(rtd, num); - if (ret < 0) { + /* create compress_device if possible */ + ret = snd_soc_dai_compress_new(cpu_dai, rtd, num); + if (ret != -ENOTSUPP) { + if (ret < 0) dev_err(card->dev, "ASoC: can't create compress %s\n", dai_link->stream_name); - return ret; - } - } else if (!dai_link->params) { + return ret; + } + + if (!dai_link->params) { /* create the pcm */ ret = soc_new_pcm(rtd, num); if (ret < 0) { diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 384765c747da..e6f161b9f975 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -379,3 +379,11 @@ int snd_soc_dai_remove(struct snd_soc_dai *dai) return dai->driver->remove(dai); return 0; } + +int snd_soc_dai_compress_new(struct snd_soc_dai *dai, + struct snd_soc_pcm_runtime *rtd, int num) +{ + if (dai->driver->compress_new) + return dai->driver->compress_new(rtd, num); + return -ENOTSUPP; +} -- cgit v1.2.3 From 467fece8fbc6774a3a3bd0981e1a342fb5022706 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 22 Jul 2019 10:36:16 +0900 Subject: ASoC: soc-dai: move snd_soc_dai_stream_valid() to soc-dai.c snd_soc_dai_stream_valid() is function to check stream validity. But, some code is using it, some code are checking stream->channels_min directly. Doing samethings by different method is confusable. This patch uses same funcntion for same purpose. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87ftmyhmzz.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 1 + sound/soc/soc-compress.c | 9 ++++----- sound/soc/soc-dai.c | 18 ++++++++++++++++++ sound/soc/soc-pcm.c | 39 ++++++++++----------------------------- 4 files changed, 33 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 0f8b09520020..dc48fe081a20 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -168,6 +168,7 @@ int snd_soc_dai_probe(struct snd_soc_dai *dai); int snd_soc_dai_remove(struct snd_soc_dai *dai); int snd_soc_dai_compress_new(struct snd_soc_dai *dai, struct snd_soc_pcm_runtime *rtd, int num); +bool snd_soc_dai_stream_valid(struct snd_soc_dai *dai, int stream); struct snd_soc_dai_ops { /* diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c index ddef4ff677ce..289211069a1e 100644 --- a/sound/soc/soc-compress.c +++ b/sound/soc/soc-compress.c @@ -872,14 +872,13 @@ int snd_soc_new_compress(struct snd_soc_pcm_runtime *rtd, int num) } /* check client and interface hw capabilities */ - if (codec_dai->driver->playback.channels_min) + if (snd_soc_dai_stream_valid(codec_dai, SNDRV_PCM_STREAM_PLAYBACK) && + snd_soc_dai_stream_valid(cpu_dai, SNDRV_PCM_STREAM_PLAYBACK)) playback = 1; - if (codec_dai->driver->capture.channels_min) + if (snd_soc_dai_stream_valid(codec_dai, SNDRV_PCM_STREAM_CAPTURE) && + snd_soc_dai_stream_valid(cpu_dai, SNDRV_PCM_STREAM_CAPTURE)) capture = 1; - capture = capture && cpu_dai->driver->capture.channels_min; - playback = playback && cpu_dai->driver->playback.channels_min; - /* * Compress devices are unidirectional so only one of the directions * should be set, check for that (xor) diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index e6f161b9f975..1c7f63871c1d 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -387,3 +387,21 @@ int snd_soc_dai_compress_new(struct snd_soc_dai *dai, return dai->driver->compress_new(rtd, num); return -ENOTSUPP; } + +/* + * snd_soc_dai_stream_valid() - check if a DAI supports the given stream + * + * Returns true if the DAI supports the indicated stream type. + */ +bool snd_soc_dai_stream_valid(struct snd_soc_dai *dai, int dir) +{ + struct snd_soc_pcm_stream *stream; + + if (dir == SNDRV_PCM_STREAM_PLAYBACK) + stream = &dai->driver->playback; + else + stream = &dai->driver->capture; + + /* If the codec specifies any channels at all, it supports the stream */ + return stream->channels_min; +} diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index f3137723301c..fabeac164a6c 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -29,24 +29,6 @@ #define DPCM_MAX_BE_USERS 8 -/* - * snd_soc_dai_stream_valid() - check if a DAI supports the given stream - * - * Returns true if the DAI supports the indicated stream type. - */ -static bool snd_soc_dai_stream_valid(struct snd_soc_dai *dai, int stream) -{ - struct snd_soc_pcm_stream *codec_stream; - - if (stream == SNDRV_PCM_STREAM_PLAYBACK) - codec_stream = &dai->driver->playback; - else - codec_stream = &dai->driver->capture; - - /* If the codec specifies any channels at all, it supports the stream */ - return codec_stream->channels_min; -} - /** * snd_soc_runtime_activate() - Increment active count for PCM runtime components * @rtd: ASoC PCM runtime that is activated @@ -2688,8 +2670,8 @@ static int soc_dpcm_fe_runtime_update(struct snd_soc_pcm_runtime *fe, int new) new ? "new" : "old", fe->dai_link->name); /* skip if FE doesn't have playback capability */ - if (!fe->cpu_dai->driver->playback.channels_min || - !fe->codec_dai->driver->playback.channels_min) + if (!snd_soc_dai_stream_valid(fe->cpu_dai, SNDRV_PCM_STREAM_PLAYBACK) || + !snd_soc_dai_stream_valid(fe->codec_dai, SNDRV_PCM_STREAM_PLAYBACK)) goto capture; /* skip if FE isn't currently playing */ @@ -2719,8 +2701,8 @@ static int soc_dpcm_fe_runtime_update(struct snd_soc_pcm_runtime *fe, int new) capture: /* skip if FE doesn't have capture capability */ - if (!fe->cpu_dai->driver->capture.channels_min || - !fe->codec_dai->driver->capture.channels_min) + if (!snd_soc_dai_stream_valid(fe->cpu_dai, SNDRV_PCM_STREAM_CAPTURE) || + !snd_soc_dai_stream_valid(fe->codec_dai, SNDRV_PCM_STREAM_CAPTURE)) return 0; /* skip if FE isn't currently capturing */ @@ -3030,14 +3012,13 @@ int soc_new_pcm(struct snd_soc_pcm_runtime *rtd, int num) capture = rtd->dai_link->dpcm_capture; } else { for_each_rtd_codec_dai(rtd, i, codec_dai) { - if (codec_dai->driver->playback.channels_min) + if (snd_soc_dai_stream_valid(codec_dai, SNDRV_PCM_STREAM_PLAYBACK) && + snd_soc_dai_stream_valid(cpu_dai, SNDRV_PCM_STREAM_PLAYBACK)) playback = 1; - if (codec_dai->driver->capture.channels_min) + if (snd_soc_dai_stream_valid(codec_dai, SNDRV_PCM_STREAM_CAPTURE) && + snd_soc_dai_stream_valid(cpu_dai, SNDRV_PCM_STREAM_CAPTURE)) capture = 1; } - - capture = capture && cpu_dai->driver->capture.channels_min; - playback = playback && cpu_dai->driver->playback.channels_min; } if (rtd->dai_link->playback_only) { @@ -3375,11 +3356,11 @@ static ssize_t dpcm_state_read_file(struct file *file, char __user *user_buf, if (!buf) return -ENOMEM; - if (fe->cpu_dai->driver->playback.channels_min) + if (snd_soc_dai_stream_valid(fe->cpu_dai, SNDRV_PCM_STREAM_PLAYBACK)) offset += dpcm_show_state(fe, SNDRV_PCM_STREAM_PLAYBACK, buf + offset, out_count - offset); - if (fe->cpu_dai->driver->capture.channels_min) + if (snd_soc_dai_stream_valid(fe->cpu_dai, SNDRV_PCM_STREAM_CAPTURE)) offset += dpcm_show_state(fe, SNDRV_PCM_STREAM_CAPTURE, buf + offset, out_count - offset); -- cgit v1.2.3 From 6749d59016981bca6d7000e40bdb08eed78dfa6f Mon Sep 17 00:00:00 2001 From: John Hurley Date: Tue, 23 Jul 2019 15:33:59 +0100 Subject: net: sched: include mpls actions in hardware intermediate representation A recent addition to TC actions is the ability to manipulate the MPLS headers on packets. In preparation to offload such actions to hardware, update the IR code to accept and prepare the new actions. Note that no driver currently impliments the MPLS dec_ttl action so this is not included. Signed-off-by: John Hurley Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/flow_offload.h | 19 +++++++++++ include/net/tc_act/tc_mpls.h | 75 ++++++++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 25 +++++++++++++++ 3 files changed, 119 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index b16d21636d69..00b9aab5fdc1 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -131,6 +131,9 @@ enum flow_action_id { FLOW_ACTION_SAMPLE, FLOW_ACTION_POLICE, FLOW_ACTION_CT, + FLOW_ACTION_MPLS_PUSH, + FLOW_ACTION_MPLS_POP, + FLOW_ACTION_MPLS_MANGLE, }; /* This is mirroring enum pedit_header_type definition for easy mapping between @@ -184,6 +187,22 @@ struct flow_action_entry { int action; u16 zone; } ct; + struct { /* FLOW_ACTION_MPLS_PUSH */ + u32 label; + __be16 proto; + u8 tc; + u8 bos; + u8 ttl; + } mpls_push; + struct { /* FLOW_ACTION_MPLS_POP */ + __be16 proto; + } mpls_pop; + struct { /* FLOW_ACTION_MPLS_MANGLE */ + u32 label; + u8 tc; + u8 bos; + u8 ttl; + } mpls_mangle; }; }; diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h index 4bc3d9250ef0..721de4f5733a 100644 --- a/include/net/tc_act/tc_mpls.h +++ b/include/net/tc_act/tc_mpls.h @@ -27,4 +27,79 @@ struct tcf_mpls { }; #define to_mpls(a) ((struct tcf_mpls *)a) +static inline bool is_tcf_mpls(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->id == TCA_ID_MPLS) + return true; +#endif + return false; +} + +static inline u32 tcf_mpls_action(const struct tc_action *a) +{ + u32 tcfm_action; + + rcu_read_lock(); + tcfm_action = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_action; + rcu_read_unlock(); + + return tcfm_action; +} + +static inline __be16 tcf_mpls_proto(const struct tc_action *a) +{ + __be16 tcfm_proto; + + rcu_read_lock(); + tcfm_proto = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_proto; + rcu_read_unlock(); + + return tcfm_proto; +} + +static inline u32 tcf_mpls_label(const struct tc_action *a) +{ + u32 tcfm_label; + + rcu_read_lock(); + tcfm_label = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_label; + rcu_read_unlock(); + + return tcfm_label; +} + +static inline u8 tcf_mpls_tc(const struct tc_action *a) +{ + u8 tcfm_tc; + + rcu_read_lock(); + tcfm_tc = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_tc; + rcu_read_unlock(); + + return tcfm_tc; +} + +static inline u8 tcf_mpls_bos(const struct tc_action *a) +{ + u8 tcfm_bos; + + rcu_read_lock(); + tcfm_bos = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_bos; + rcu_read_unlock(); + + return tcfm_bos; +} + +static inline u8 tcf_mpls_ttl(const struct tc_action *a) +{ + u8 tcfm_ttl; + + rcu_read_lock(); + tcfm_ttl = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_ttl; + rcu_read_unlock(); + + return tcfm_ttl; +} + #endif /* __NET_TC_MPLS_H */ diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index efd3cfb80a2a..3565d9aa09aa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -36,6 +36,7 @@ #include #include #include +#include extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -3269,6 +3270,30 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->id = FLOW_ACTION_CT; entry->ct.action = tcf_ct_action(act); entry->ct.zone = tcf_ct_zone(act); + } else if (is_tcf_mpls(act)) { + switch (tcf_mpls_action(act)) { + case TCA_MPLS_ACT_PUSH: + entry->id = FLOW_ACTION_MPLS_PUSH; + entry->mpls_push.proto = tcf_mpls_proto(act); + entry->mpls_push.label = tcf_mpls_label(act); + entry->mpls_push.tc = tcf_mpls_tc(act); + entry->mpls_push.bos = tcf_mpls_bos(act); + entry->mpls_push.ttl = tcf_mpls_ttl(act); + break; + case TCA_MPLS_ACT_POP: + entry->id = FLOW_ACTION_MPLS_POP; + entry->mpls_pop.proto = tcf_mpls_proto(act); + break; + case TCA_MPLS_ACT_MODIFY: + entry->id = FLOW_ACTION_MPLS_MANGLE; + entry->mpls_mangle.label = tcf_mpls_label(act); + entry->mpls_mangle.tc = tcf_mpls_tc(act); + entry->mpls_mangle.bos = tcf_mpls_bos(act); + entry->mpls_mangle.ttl = tcf_mpls_ttl(act); + break; + default: + goto err_out; + } } else { goto err_out; } -- cgit v1.2.3 From d9b8aadaffa65809d146cf0f8632a22a946367d7 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 19 Jul 2019 11:18:15 +0200 Subject: bpf: fix narrower loads on s390 The very first check in test_pkt_md_access is failing on s390, which happens because loading a part of a struct __sk_buff field produces an incorrect result. The preprocessed code of the check is: { __u8 tmp = *((volatile __u8 *)&skb->len + ((sizeof(skb->len) - sizeof(__u8)) / sizeof(__u8))); if (tmp != ((*(volatile __u32 *)&skb->len) & 0xFF)) return 2; }; clang generates the following code for it: 0: 71 21 00 03 00 00 00 00 r2 = *(u8 *)(r1 + 3) 1: 61 31 00 00 00 00 00 00 r3 = *(u32 *)(r1 + 0) 2: 57 30 00 00 00 00 00 ff r3 &= 255 3: 5d 23 00 1d 00 00 00 00 if r2 != r3 goto +29 Finally, verifier transforms it to: 0: (61) r2 = *(u32 *)(r1 +104) 1: (bc) w2 = w2 2: (74) w2 >>= 24 3: (bc) w2 = w2 4: (54) w2 &= 255 5: (bc) w2 = w2 The problem is that when verifier emits the code to replace a partial load of a struct __sk_buff field (*(u8 *)(r1 + 3)) with a full load of struct sk_buff field (*(u32 *)(r1 + 104)), an optional shift and a bitwise AND, it assumes that the machine is little endian and incorrectly decides to use a shift. Adjust shift count calculation to account for endianness. Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 13 +++++++++++++ kernel/bpf/verifier.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index ff65d22cf336..92c6e31fb008 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -24,6 +24,7 @@ #include +#include #include #include @@ -747,6 +748,18 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } +static inline u8 +bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default) +{ + u8 load_off = off & (size_default - 1); + +#ifdef __LITTLE_ENDIAN + return load_off * 8; +#else + return (size_default - (load_off + size)) * 8; +#endif +} + #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..c84d83f86141 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8616,8 +8616,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = (off & (size_default - 1)) * 8; - + u8 shift = bpf_ctx_narrow_load_shift(off, size, + size_default); if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, -- cgit v1.2.3 From 4f77e0956bd99901af2bfc6641437ff8bff8d4a4 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Mon, 15 Jul 2019 14:34:16 -0600 Subject: PCI: Remove pci_block_cfg_access() et al (unused) Remove the following unused functions from include/linux/pci.h: pci_block_cfg_access() pci_block_cfg_access_in_atomic() pci_unblock_cfg_access() These were added by fb51ccbf217c ("PCI: Rework config space blocking services"), though no callers were added. Link: https://lore.kernel.org/r/20190715203416.37547-1-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Bulwahn --- include/linux/pci.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..61b64fa1b0b1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1747,11 +1747,6 @@ static inline void pci_release_regions(struct pci_dev *dev) { } static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; } -static inline void pci_block_cfg_access(struct pci_dev *dev) { } -static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev) -{ return 0; } -static inline void pci_unblock_cfg_access(struct pci_dev *dev) { } - static inline struct pci_bus *pci_find_next_bus(const struct pci_bus *from) { return NULL; } static inline struct pci_dev *pci_get_slot(struct pci_bus *bus, -- cgit v1.2.3 From 5523ca8f624dc9268bda109d37cbdc3efb5e79be Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 21 Jul 2019 14:50:39 +0200 Subject: scsi: fcoe: fix a typo #define relative to FCOE CTLR start with FCOE_CTLR, except FCOE_CTRL_SOL_TOV. This is likely a typo and CTRL should be CTLR here as well. Signed-off-by: Christophe JAILLET Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/fcoe/fcoe_ctlr.c | 2 +- include/scsi/libfcoe.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c index 590ec8009f52..1a85fe9e4b7b 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -1019,7 +1019,7 @@ static void fcoe_ctlr_recv_adv(struct fcoe_ctlr *fip, struct sk_buff *skb) { struct fcoe_fcf *fcf; struct fcoe_fcf new; - unsigned long sol_tov = msecs_to_jiffies(FCOE_CTRL_SOL_TOV); + unsigned long sol_tov = msecs_to_jiffies(FCOE_CTLR_SOL_TOV); int first = 0; int mtu_valid; int found = 0; diff --git a/include/scsi/libfcoe.h b/include/scsi/libfcoe.h index c50fb297e265..dc14b52577f7 100644 --- a/include/scsi/libfcoe.h +++ b/include/scsi/libfcoe.h @@ -31,7 +31,7 @@ * FIP tunable parameters. */ #define FCOE_CTLR_START_DELAY 2000 /* mS after first adv. to choose FCF */ -#define FCOE_CTRL_SOL_TOV 2000 /* min. solicitation interval (mS) */ +#define FCOE_CTLR_SOL_TOV 2000 /* min. solicitation interval (mS) */ #define FCOE_CTLR_FCF_LIMIT 20 /* max. number of FCF entries */ #define FCOE_CTLR_VN2VN_LOGIN_LIMIT 3 /* max. VN2VN rport login retries */ -- cgit v1.2.3 From 60649d4e0af6c26b6c423dea9c57f39e823fc0c5 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 23 Jul 2019 14:08:47 +0200 Subject: can: remove obsolete empty ioctl() handler With commit c7cbdbf29f488a ("net: rework SIOCGSTAMP ioctl handling") the only ioctl function in can_ioctl() has been removed. As this SIOCGSTAMP ioctl command is now handled in net/socket.c we can entirely remove the CAN specific ioctl functions. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 1 - net/can/af_can.c | 9 --------- net/can/bcm.c | 2 +- net/can/raw.c | 2 +- 4 files changed, 2 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index 6099bc18bd0c..f8284a94a13d 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -57,6 +57,5 @@ extern void can_rx_unregister(struct net *net, struct net_device *dev, void *data); extern int can_send(struct sk_buff *skb, int loop); -extern int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); #endif /* !_CAN_CORE_H */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 80281ef2ccbd..9c86de2da45e 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -87,15 +87,6 @@ static atomic_t skbcounter = ATOMIC_INIT(0); * af_can socket functions */ -int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - default: - return -ENOIOCTLCMD; - } -} -EXPORT_SYMBOL(can_ioctl); - static void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); diff --git a/net/can/bcm.c b/net/can/bcm.c index a34ee52f19ea..1eecf4d3e8d2 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1688,7 +1688,7 @@ static const struct proto_ops bcm_ops = { .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, - .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .ioctl = sock_no_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/raw.c b/net/can/raw.c index afcbff063a67..bbbe3dd0abe9 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -845,7 +845,7 @@ static const struct proto_ops raw_ops = { .accept = sock_no_accept, .getname = raw_getname, .poll = datagram_poll, - .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .ioctl = sock_no_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, -- cgit v1.2.3 From fba76a58452694b9b13c07e48839fa84c75f57af Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 23 Jul 2019 15:17:55 +0200 Subject: can: Add SPDX license identifiers for CAN subsystem Add missing SPDX identifiers for the CAN network layer and correct the SPDX license for two of its include files to make sure the BSD-3-Clause applies for the entire subsystem. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 2 +- include/linux/can/skb.h | 2 +- net/can/af_can.c | 1 + net/can/af_can.h | 1 + net/can/bcm.c | 1 + net/can/gw.c | 1 + net/can/proc.c | 1 + net/can/raw.c | 1 + 8 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index f8284a94a13d..708c10d3417a 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * linux/can/core.h * diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index b3379a97245c..a954def26c0d 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * linux/can/skb.h * diff --git a/net/can/af_can.c b/net/can/af_can.c index 9c86de2da45e..76cf83b2bd40 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) diff --git a/net/can/af_can.h b/net/can/af_can.h index 9cb3719632bd..ef21f7c6bc80 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * Copyright (c) 2002-2007 Volkswagen Group Electronic Research * All rights reserved. diff --git a/net/can/bcm.c b/net/can/bcm.c index 1eecf4d3e8d2..8da986b19d88 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * diff --git a/net/can/gw.c b/net/can/gw.c index 5275ddf580bc..8abae841d504 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* * gw.c - CAN frame Gateway/Router/Bridge with netlink interface * diff --git a/net/can/proc.c b/net/can/proc.c index 70fea17bb04c..edb822c31902 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* * proc.c - procfs support for Protocol family CAN core module * diff --git a/net/can/raw.c b/net/can/raw.c index bbbe3dd0abe9..ff720272f7b7 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* * raw.c - Raw sockets for protocol family CAN * -- cgit v1.2.3 From 2f5947dfcaecb99f2dd559156eecbeb7b95e4c02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jul 2019 09:24:49 +0200 Subject: Documentation: move Documentation/virtual to Documentation/virt Renaming docs seems to be en vogue at the moment, so fix on of the grossly misnamed directories. We usually never use "virtual" as a shortcut for virtualization in the kernel, but always virt, as seen in the virt/ top-level directory. Fix up the documentation to match that. Fixes: ed16648eb5b8 ("Move kvm, uml, and lguest subdirectories under a common "virtual" directory, I.E:") Signed-off-by: Christoph Hellwig Signed-off-by: Paolo Bonzini --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/virt/index.rst | 18 + Documentation/virt/kvm/amd-memory-encryption.rst | 250 + Documentation/virt/kvm/api.txt | 5296 ++++++++++++++++++++ Documentation/virt/kvm/arm/hyp-abi.txt | 53 + Documentation/virt/kvm/arm/psci.txt | 61 + Documentation/virt/kvm/cpuid.rst | 107 + Documentation/virt/kvm/devices/README | 1 + Documentation/virt/kvm/devices/arm-vgic-its.txt | 181 + Documentation/virt/kvm/devices/arm-vgic-v3.txt | 251 + Documentation/virt/kvm/devices/arm-vgic.txt | 127 + Documentation/virt/kvm/devices/mpic.txt | 53 + Documentation/virt/kvm/devices/s390_flic.txt | 163 + Documentation/virt/kvm/devices/vcpu.txt | 62 + Documentation/virt/kvm/devices/vfio.txt | 36 + Documentation/virt/kvm/devices/vm.txt | 270 + Documentation/virt/kvm/devices/xics.txt | 66 + Documentation/virt/kvm/devices/xive.txt | 197 + Documentation/virt/kvm/halt-polling.txt | 136 + Documentation/virt/kvm/hypercalls.txt | 154 + Documentation/virt/kvm/index.rst | 11 + Documentation/virt/kvm/locking.txt | 215 + Documentation/virt/kvm/mmu.txt | 449 ++ Documentation/virt/kvm/msr.txt | 284 ++ Documentation/virt/kvm/nested-vmx.txt | 240 + Documentation/virt/kvm/ppc-pv.txt | 212 + Documentation/virt/kvm/review-checklist.txt | 38 + Documentation/virt/kvm/s390-diag.txt | 83 + Documentation/virt/kvm/timekeeping.txt | 612 +++ Documentation/virt/kvm/vcpu-requests.rst | 307 ++ Documentation/virt/paravirt_ops.rst | 35 + Documentation/virt/uml/UserModeLinux-HOWTO.txt | 4589 +++++++++++++++++ Documentation/virtual/index.rst | 18 - .../virtual/kvm/amd-memory-encryption.rst | 250 - Documentation/virtual/kvm/api.txt | 5296 -------------------- Documentation/virtual/kvm/arm/hyp-abi.txt | 53 - Documentation/virtual/kvm/arm/psci.txt | 61 - Documentation/virtual/kvm/cpuid.rst | 107 - Documentation/virtual/kvm/devices/README | 1 - Documentation/virtual/kvm/devices/arm-vgic-its.txt | 181 - Documentation/virtual/kvm/devices/arm-vgic-v3.txt | 251 - Documentation/virtual/kvm/devices/arm-vgic.txt | 127 - Documentation/virtual/kvm/devices/mpic.txt | 53 - Documentation/virtual/kvm/devices/s390_flic.txt | 163 - Documentation/virtual/kvm/devices/vcpu.txt | 62 - Documentation/virtual/kvm/devices/vfio.txt | 36 - Documentation/virtual/kvm/devices/vm.txt | 270 - Documentation/virtual/kvm/devices/xics.txt | 66 - Documentation/virtual/kvm/devices/xive.txt | 197 - Documentation/virtual/kvm/halt-polling.txt | 136 - Documentation/virtual/kvm/hypercalls.txt | 154 - Documentation/virtual/kvm/index.rst | 11 - Documentation/virtual/kvm/locking.txt | 215 - Documentation/virtual/kvm/mmu.txt | 449 -- Documentation/virtual/kvm/msr.txt | 284 -- Documentation/virtual/kvm/nested-vmx.txt | 240 - Documentation/virtual/kvm/ppc-pv.txt | 212 - Documentation/virtual/kvm/review-checklist.txt | 38 - Documentation/virtual/kvm/s390-diag.txt | 83 - Documentation/virtual/kvm/timekeeping.txt | 612 --- Documentation/virtual/kvm/vcpu-requests.rst | 307 -- Documentation/virtual/paravirt_ops.rst | 35 - Documentation/virtual/uml/UserModeLinux-HOWTO.txt | 4589 ----------------- MAINTAINERS | 6 +- arch/powerpc/include/uapi/asm/kvm_para.h | 2 +- arch/x86/kvm/mmu.c | 2 +- include/uapi/linux/kvm.h | 4 +- tools/include/uapi/linux/kvm.h | 4 +- virt/kvm/arm/arm.c | 2 +- virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 4 +- 71 files changed, 14571 insertions(+), 14571 deletions(-) create mode 100644 Documentation/virt/index.rst create mode 100644 Documentation/virt/kvm/amd-memory-encryption.rst create mode 100644 Documentation/virt/kvm/api.txt create mode 100644 Documentation/virt/kvm/arm/hyp-abi.txt create mode 100644 Documentation/virt/kvm/arm/psci.txt create mode 100644 Documentation/virt/kvm/cpuid.rst create mode 100644 Documentation/virt/kvm/devices/README create mode 100644 Documentation/virt/kvm/devices/arm-vgic-its.txt create mode 100644 Documentation/virt/kvm/devices/arm-vgic-v3.txt create mode 100644 Documentation/virt/kvm/devices/arm-vgic.txt create mode 100644 Documentation/virt/kvm/devices/mpic.txt create mode 100644 Documentation/virt/kvm/devices/s390_flic.txt create mode 100644 Documentation/virt/kvm/devices/vcpu.txt create mode 100644 Documentation/virt/kvm/devices/vfio.txt create mode 100644 Documentation/virt/kvm/devices/vm.txt create mode 100644 Documentation/virt/kvm/devices/xics.txt create mode 100644 Documentation/virt/kvm/devices/xive.txt create mode 100644 Documentation/virt/kvm/halt-polling.txt create mode 100644 Documentation/virt/kvm/hypercalls.txt create mode 100644 Documentation/virt/kvm/index.rst create mode 100644 Documentation/virt/kvm/locking.txt create mode 100644 Documentation/virt/kvm/mmu.txt create mode 100644 Documentation/virt/kvm/msr.txt create mode 100644 Documentation/virt/kvm/nested-vmx.txt create mode 100644 Documentation/virt/kvm/ppc-pv.txt create mode 100644 Documentation/virt/kvm/review-checklist.txt create mode 100644 Documentation/virt/kvm/s390-diag.txt create mode 100644 Documentation/virt/kvm/timekeeping.txt create mode 100644 Documentation/virt/kvm/vcpu-requests.rst create mode 100644 Documentation/virt/paravirt_ops.rst create mode 100644 Documentation/virt/uml/UserModeLinux-HOWTO.txt delete mode 100644 Documentation/virtual/index.rst delete mode 100644 Documentation/virtual/kvm/amd-memory-encryption.rst delete mode 100644 Documentation/virtual/kvm/api.txt delete mode 100644 Documentation/virtual/kvm/arm/hyp-abi.txt delete mode 100644 Documentation/virtual/kvm/arm/psci.txt delete mode 100644 Documentation/virtual/kvm/cpuid.rst delete mode 100644 Documentation/virtual/kvm/devices/README delete mode 100644 Documentation/virtual/kvm/devices/arm-vgic-its.txt delete mode 100644 Documentation/virtual/kvm/devices/arm-vgic-v3.txt delete mode 100644 Documentation/virtual/kvm/devices/arm-vgic.txt delete mode 100644 Documentation/virtual/kvm/devices/mpic.txt delete mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt delete mode 100644 Documentation/virtual/kvm/devices/vcpu.txt delete mode 100644 Documentation/virtual/kvm/devices/vfio.txt delete mode 100644 Documentation/virtual/kvm/devices/vm.txt delete mode 100644 Documentation/virtual/kvm/devices/xics.txt delete mode 100644 Documentation/virtual/kvm/devices/xive.txt delete mode 100644 Documentation/virtual/kvm/halt-polling.txt delete mode 100644 Documentation/virtual/kvm/hypercalls.txt delete mode 100644 Documentation/virtual/kvm/index.rst delete mode 100644 Documentation/virtual/kvm/locking.txt delete mode 100644 Documentation/virtual/kvm/mmu.txt delete mode 100644 Documentation/virtual/kvm/msr.txt delete mode 100644 Documentation/virtual/kvm/nested-vmx.txt delete mode 100644 Documentation/virtual/kvm/ppc-pv.txt delete mode 100644 Documentation/virtual/kvm/review-checklist.txt delete mode 100644 Documentation/virtual/kvm/s390-diag.txt delete mode 100644 Documentation/virtual/kvm/timekeeping.txt delete mode 100644 Documentation/virtual/kvm/vcpu-requests.rst delete mode 100644 Documentation/virtual/paravirt_ops.rst delete mode 100644 Documentation/virtual/uml/UserModeLinux-HOWTO.txt (limited to 'include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 099c5a4be95b..8a8880cec34b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2532,7 +2532,7 @@ mem_encrypt=on: Activate SME mem_encrypt=off: Do not activate SME - Refer to Documentation/virtual/kvm/amd-memory-encryption.rst + Refer to Documentation/virt/kvm/amd-memory-encryption.rst for details on when memory encryption can be activated. mem_sleep_default= [SUSPEND] Default system suspend mode: diff --git a/Documentation/virt/index.rst b/Documentation/virt/index.rst new file mode 100644 index 000000000000..062ffb527043 --- /dev/null +++ b/Documentation/virt/index.rst @@ -0,0 +1,18 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Linux Virtualization Support +============================ + +.. toctree:: + :maxdepth: 2 + + kvm/index + paravirt_ops + +.. only:: html and subproject + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst new file mode 100644 index 000000000000..d18c97b4e140 --- /dev/null +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -0,0 +1,250 @@ +====================================== +Secure Encrypted Virtualization (SEV) +====================================== + +Overview +======== + +Secure Encrypted Virtualization (SEV) is a feature found on AMD processors. + +SEV is an extension to the AMD-V architecture which supports running +virtual machines (VMs) under the control of a hypervisor. When enabled, +the memory contents of a VM will be transparently encrypted with a key +unique to that VM. + +The hypervisor can determine the SEV support through the CPUID +instruction. The CPUID function 0x8000001f reports information related +to SEV:: + + 0x8000001f[eax]: + Bit[1] indicates support for SEV + ... + [ecx]: + Bits[31:0] Number of encrypted guests supported simultaneously + +If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 +(MSR_K7_HWCR) can be used to determine if it can be enabled:: + + 0xc001_0010: + Bit[23] 1 = memory encryption can be enabled + 0 = memory encryption can not be enabled + + 0xc001_0015: + Bit[0] 1 = memory encryption can be enabled + 0 = memory encryption can not be enabled + +When SEV support is available, it can be enabled in a specific VM by +setting the SEV bit before executing VMRUN.:: + + VMCB[0x90]: + Bit[1] 1 = SEV is enabled + 0 = SEV is disabled + +SEV hardware uses ASIDs to associate a memory encryption key with a VM. +Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value +defined in the CPUID 0x8000001f[ecx] field. + +SEV Key Management +================== + +The SEV guest key management is handled by a separate processor called the AMD +Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure +key management interface to perform common hypervisor activities such as +encrypting bootstrap code, snapshot, migrating and debugging the guest. For more +information, see the SEV Key Management spec [api-spec]_ + +KVM implements the following commands to support common lifecycle events of SEV +guests, such as launching, running, snapshotting, migrating and decommissioning. + +1. KVM_SEV_INIT +--------------- + +The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform +context. In a typical workflow, this command should be the first command issued. + +Returns: 0 on success, -negative on error + +2. KVM_SEV_LAUNCH_START +----------------------- + +The KVM_SEV_LAUNCH_START command is used for creating the memory encryption +context. To create the encryption context, user must provide a guest policy, +the owner's public Diffie-Hellman (PDH) key and session information. + +Parameters: struct kvm_sev_launch_start (in/out) + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_start { + __u32 handle; /* if zero then firmware creates a new handle */ + __u32 policy; /* guest's policy */ + + __u64 dh_uaddr; /* userspace address pointing to the guest owner's PDH key */ + __u32 dh_len; + + __u64 session_addr; /* userspace address which points to the guest session information */ + __u32 session_len; + }; + +On success, the 'handle' field contains a new handle and on error, a negative value. + +For more details, see SEV spec Section 6.2. + +3. KVM_SEV_LAUNCH_UPDATE_DATA +----------------------------- + +The KVM_SEV_LAUNCH_UPDATE_DATA is used for encrypting a memory region. It also +calculates a measurement of the memory contents. The measurement is a signature +of the memory contents that can be sent to the guest owner as an attestation +that the memory was encrypted correctly by the firmware. + +Parameters (in): struct kvm_sev_launch_update_data + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_update { + __u64 uaddr; /* userspace address to be encrypted (must be 16-byte aligned) */ + __u32 len; /* length of the data to be encrypted (must be 16-byte aligned) */ + }; + +For more details, see SEV spec Section 6.3. + +4. KVM_SEV_LAUNCH_MEASURE +------------------------- + +The KVM_SEV_LAUNCH_MEASURE command is used to retrieve the measurement of the +data encrypted by the KVM_SEV_LAUNCH_UPDATE_DATA command. The guest owner may +wait to provide the guest with confidential information until it can verify the +measurement. Since the guest owner knows the initial contents of the guest at +boot, the measurement can be verified by comparing it to what the guest owner +expects. + +Parameters (in): struct kvm_sev_launch_measure + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_measure { + __u64 uaddr; /* where to copy the measurement */ + __u32 len; /* length of measurement blob */ + }; + +For more details on the measurement verification flow, see SEV spec Section 6.4. + +5. KVM_SEV_LAUNCH_FINISH +------------------------ + +After completion of the launch flow, the KVM_SEV_LAUNCH_FINISH command can be +issued to make the guest ready for the execution. + +Returns: 0 on success, -negative on error + +6. KVM_SEV_GUEST_STATUS +----------------------- + +The KVM_SEV_GUEST_STATUS command is used to retrieve status information about a +SEV-enabled guest. + +Parameters (out): struct kvm_sev_guest_status + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_guest_status { + __u32 handle; /* guest handle */ + __u32 policy; /* guest policy */ + __u8 state; /* guest state (see enum below) */ + }; + +SEV guest state: + +:: + + enum { + SEV_STATE_INVALID = 0; + SEV_STATE_LAUNCHING, /* guest is currently being launched */ + SEV_STATE_SECRET, /* guest is being launched and ready to accept the ciphertext data */ + SEV_STATE_RUNNING, /* guest is fully launched and running */ + SEV_STATE_RECEIVING, /* guest is being migrated in from another SEV machine */ + SEV_STATE_SENDING /* guest is getting migrated out to another SEV machine */ + }; + +7. KVM_SEV_DBG_DECRYPT +---------------------- + +The KVM_SEV_DEBUG_DECRYPT command can be used by the hypervisor to request the +firmware to decrypt the data at the given memory region. + +Parameters (in): struct kvm_sev_dbg + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_dbg { + __u64 src_uaddr; /* userspace address of data to decrypt */ + __u64 dst_uaddr; /* userspace address of destination */ + __u32 len; /* length of memory region to decrypt */ + }; + +The command returns an error if the guest policy does not allow debugging. + +8. KVM_SEV_DBG_ENCRYPT +---------------------- + +The KVM_SEV_DEBUG_ENCRYPT command can be used by the hypervisor to request the +firmware to encrypt the data at the given memory region. + +Parameters (in): struct kvm_sev_dbg + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_dbg { + __u64 src_uaddr; /* userspace address of data to encrypt */ + __u64 dst_uaddr; /* userspace address of destination */ + __u32 len; /* length of memory region to encrypt */ + }; + +The command returns an error if the guest policy does not allow debugging. + +9. KVM_SEV_LAUNCH_SECRET +------------------------ + +The KVM_SEV_LAUNCH_SECRET command can be used by the hypervisor to inject secret +data after the measurement has been validated by the guest owner. + +Parameters (in): struct kvm_sev_launch_secret + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_secret { + __u64 hdr_uaddr; /* userspace address containing the packet header */ + __u32 hdr_len; + + __u64 guest_uaddr; /* the guest memory region where the secret should be injected */ + __u32 guest_len; + + __u64 trans_uaddr; /* the hypervisor memory region which contains the secret */ + __u32 trans_len; + }; + +References +========== + + +See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info. + +.. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf +.. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf +.. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34) +.. [kvm-forum] http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt new file mode 100644 index 000000000000..2d067767b617 --- /dev/null +++ b/Documentation/virt/kvm/api.txt @@ -0,0 +1,5296 @@ +The Definitive KVM (Kernel-based Virtual Machine) API Documentation +=================================================================== + +1. General description +---------------------- + +The kvm API is a set of ioctls that are issued to control various aspects +of a virtual machine. The ioctls belong to three classes: + + - System ioctls: These query and set global attributes which affect the + whole kvm subsystem. In addition a system ioctl is used to create + virtual machines. + + - VM ioctls: These query and set attributes that affect an entire virtual + machine, for example memory layout. In addition a VM ioctl is used to + create virtual cpus (vcpus) and devices. + + VM ioctls must be issued from the same process (address space) that was + used to create the VM. + + - vcpu ioctls: These query and set attributes that control the operation + of a single virtual cpu. + + vcpu ioctls should be issued from the same thread that was used to create + the vcpu, except for asynchronous vcpu ioctl that are marked as such in + the documentation. Otherwise, the first ioctl after switching threads + could see a performance impact. + + - device ioctls: These query and set attributes that control the operation + of a single device. + + device ioctls must be issued from the same process (address space) that + was used to create the VM. + +2. File descriptors +------------------- + +The kvm API is centered around file descriptors. An initial +open("/dev/kvm") obtains a handle to the kvm subsystem; this handle +can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this +handle will create a VM file descriptor which can be used to issue VM +ioctls. A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will +create a virtual cpu or device and return a file descriptor pointing to +the new resource. Finally, ioctls on a vcpu or device fd can be used +to control the vcpu or device. For vcpus, this includes the important +task of actually running guest code. + +In general file descriptors can be migrated among processes by means +of fork() and the SCM_RIGHTS facility of unix domain socket. These +kinds of tricks are explicitly not supported by kvm. While they will +not cause harm to the host, their actual behavior is not guaranteed by +the API. See "General description" for details on the ioctl usage +model that is supported by KVM. + +It is important to note that althought VM ioctls may only be issued from +the process that created the VM, a VM's lifecycle is associated with its +file descriptor, not its creator (process). In other words, the VM and +its resources, *including the associated address space*, are not freed +until the last reference to the VM's file descriptor has been released. +For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will +not be freed until both the parent (original) process and its child have +put their references to the VM's file descriptor. + +Because a VM's resources are not freed until the last reference to its +file descriptor is released, creating additional references to a VM via +via fork(), dup(), etc... without careful consideration is strongly +discouraged and may have unwanted side effects, e.g. memory allocated +by and on behalf of the VM's process may not be freed/unaccounted when +the VM is shut down. + + +3. Extensions +------------- + +As of Linux 2.6.22, the KVM ABI has been stabilized: no backward +incompatible change are allowed. However, there is an extension +facility that allows backward-compatible extensions to the API to be +queried and used. + +The extension mechanism is not based on the Linux version number. +Instead, kvm defines extension identifiers and a facility to query +whether a particular extension identifier is available. If it is, a +set of ioctls is available for application use. + + +4. API description +------------------ + +This section describes ioctls that can be used to control kvm guests. +For each ioctl, the following information is provided along with a +description: + + Capability: which KVM extension provides this ioctl. Can be 'basic', + which means that is will be provided by any kernel that supports + API version 12 (see section 4.1), a KVM_CAP_xyz constant, which + means availability needs to be checked with KVM_CHECK_EXTENSION + (see section 4.4), or 'none' which means that while not all kernels + support this ioctl, there's no capability bit to check its + availability: for kernels that don't support the ioctl, + the ioctl returns -ENOTTY. + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Type: system, vm, or vcpu. + + Parameters: what parameters are accepted by the ioctl. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +4.1 KVM_GET_API_VERSION + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: the constant KVM_API_VERSION (=12) + +This identifies the API version as the stable kvm API. It is not +expected that this number will change. However, Linux 2.6.20 and +2.6.21 report earlier versions; these are not documented and not +supported. Applications should refuse to run if KVM_GET_API_VERSION +returns a value other than 12. If this check passes, all ioctls +described as 'basic' will be available. + + +4.2 KVM_CREATE_VM + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: machine type identifier (KVM_VM_*) +Returns: a VM fd that can be used to control the new virtual machine. + +The new VM has no virtual cpus and no memory. +You probably want to use 0 as machine type. + +In order to create user controlled virtual machines on S390, check +KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as +privileged user (CAP_SYS_ADMIN). + +To use hardware assisted virtualization on MIPS (VZ ASE) rather than +the default trap & emulate implementation (which changes the virtual +memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the +flag KVM_VM_MIPS_VZ. + + +On arm64, the physical address size for a VM (IPA Size limit) is limited +to 40bits by default. The limit can be configured if the host supports the +extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use +KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type +identifier, where IPA_Bits is the maximum width of any physical +address used by the VM. The IPA_Bits is encoded in bits[7-0] of the +machine type identifier. + +e.g, to configure a guest to use 48bit physical address size : + + vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); + +The requested size (IPA_Bits) must be : + 0 - Implies default size, 40bits (for backward compatibility) + + or + + N - Implies N bits, where N is a positive integer such that, + 32 <= N <= Host_IPA_Limit + +Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and +is dependent on the CPU capability and the kernel configuration. The limit can +be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION +ioctl() at run-time. + +Please note that configuring the IPA size does not affect the capability +exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects +size of the address translated by the stage2 level (guest physical to +host physical address translations). + + +4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST + +Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_msr_list (in/out) +Returns: 0 on success; -1 on error +Errors: + EFAULT: the msr index list cannot be read from or written to + E2BIG: the msr index list is to be to fit in the array specified by + the user. + +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +The user fills in the size of the indices array in nmsrs, and in return +kvm adjusts nmsrs to reflect the actual number of msrs and fills in the +indices array with their numbers. + +KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list +varies by kvm version and host processor, but does not change otherwise. + +Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are +not returned in the MSR list, as different vcpus can have a different number +of banks, as set via the KVM_X86_SETUP_MCE ioctl. + +KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed +to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities +and processor features that are exposed via MSRs (e.g., VMX capabilities). +This list also varies by kvm version and host processor, but does not change +otherwise. + + +4.4 KVM_CHECK_EXTENSION + +Capability: basic, KVM_CAP_CHECK_EXTENSION_VM for vm ioctl +Architectures: all +Type: system ioctl, vm ioctl +Parameters: extension identifier (KVM_CAP_*) +Returns: 0 if unsupported; 1 (or some other positive integer) if supported + +The API allows the application to query about extensions to the core +kvm API. Userspace passes an extension identifier (an integer) and +receives an integer that describes the extension availability. +Generally 0 means no and 1 means yes, but some extensions may report +additional information in the integer return value. + +Based on their initialization different VMs may have different capabilities. +It is thus encouraged to use the vm ioctl to query for capabilities (available +with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) + +4.5 KVM_GET_VCPU_MMAP_SIZE + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: size of vcpu mmap area, in bytes + +The KVM_RUN ioctl (cf.) communicates with userspace via a shared +memory region. This ioctl returns the size of that region. See the +KVM_RUN documentation for details. + + +4.6 KVM_SET_MEMORY_REGION + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: struct kvm_memory_region (in) +Returns: 0 on success, -1 on error + +This ioctl is obsolete and has been removed. + + +4.7 KVM_CREATE_VCPU + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: vcpu id (apic id on x86) +Returns: vcpu fd on success, -1 on error + +This API adds a vcpu to a virtual machine. No more than max_vcpus may be added. +The vcpu id is an integer in the range [0, max_vcpu_id). + +The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of +the KVM_CHECK_EXTENSION ioctl() at run-time. +The maximum possible value for max_vcpus can be retrieved using the +KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. + +If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 +cpus max. +If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is +same as the value returned from KVM_CAP_NR_VCPUS. + +The maximum possible value for max_vcpu_id can be retrieved using the +KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time. + +If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id +is the same as the value returned from KVM_CAP_MAX_VCPUS. + +On powerpc using book3s_hv mode, the vcpus are mapped onto virtual +threads in one or more virtual CPU cores. (This is because the +hardware requires all the hardware threads in a CPU core to be in the +same partition.) The KVM_CAP_PPC_SMT capability indicates the number +of vcpus per virtual core (vcore). The vcore id is obtained by +dividing the vcpu id by the number of vcpus per vcore. The vcpus in a +given vcore will always be in the same physical core as each other +(though that might be a different physical core from time to time). +Userspace can control the threading (SMT) mode of the guest by its +allocation of vcpu ids. For example, if userspace wants +single-threaded guest vcpus, it should make all vcpu ids be a multiple +of the number of vcpus per vcore. + +For virtual cpus that have been created with S390 user controlled virtual +machines, the resulting vcpu fd can be memory mapped at page offset +KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual +cpu's hardware control block. + + +4.8 KVM_GET_DIRTY_LOG (vm ioctl) + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: struct kvm_dirty_log (in/out) +Returns: 0 on success, -1 on error + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +Given a memory slot, return a bitmap containing any pages dirtied +since the last call to this ioctl. Bit 0 is the first page in the +memory slot. Ensure the entire structure is cleared to avoid padding +issues. + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies +the address space for which you want to return the dirty bitmap. +They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +The bits in the dirty bitmap are cleared before the ioctl returns, unless +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, +see the description of the capability. + +4.9 KVM_SET_MEMORY_ALIAS + +Capability: basic +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_memory_alias (in) +Returns: 0 (success), -1 (error) + +This ioctl is obsolete and has been removed. + + +4.10 KVM_RUN + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error +Errors: + EINTR: an unmasked signal is pending + +This ioctl is used to run a guest virtual cpu. While there are no +explicit parameters, there is an implicit parameter block that can be +obtained by mmap()ing the vcpu fd at offset 0, with the size given by +KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct +kvm_run' (see below). + + +4.11 KVM_GET_REGS + +Capability: basic +Architectures: all except ARM, arm64 +Type: vcpu ioctl +Parameters: struct kvm_regs (out) +Returns: 0 on success, -1 on error + +Reads the general purpose registers from the vcpu. + +/* x86 */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* mips */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 hi; + __u64 lo; + __u64 pc; +}; + + +4.12 KVM_SET_REGS + +Capability: basic +Architectures: all except ARM, arm64 +Type: vcpu ioctl +Parameters: struct kvm_regs (in) +Returns: 0 on success, -1 on error + +Writes the general purpose registers into the vcpu. + +See KVM_GET_REGS for the data structure. + + +4.13 KVM_GET_SREGS + +Capability: basic +Architectures: x86, ppc +Type: vcpu ioctl +Parameters: struct kvm_sregs (out) +Returns: 0 on success, -1 on error + +Reads special registers from the vcpu. + +/* x86 */ +struct kvm_sregs { + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ + +interrupt_bitmap is a bitmap of pending external interrupts. At most +one bit may be set. This interrupt has been acknowledged by the APIC +but not yet injected into the cpu core. + + +4.14 KVM_SET_SREGS + +Capability: basic +Architectures: x86, ppc +Type: vcpu ioctl +Parameters: struct kvm_sregs (in) +Returns: 0 on success, -1 on error + +Writes special registers into the vcpu. See KVM_GET_SREGS for the +data structures. + + +4.15 KVM_TRANSLATE + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_translation (in/out) +Returns: 0 on success, -1 on error + +Translates a virtual address according to the vcpu's current address +translation mode. + +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + + +4.16 KVM_INTERRUPT + +Capability: basic +Architectures: x86, ppc, mips +Type: vcpu ioctl +Parameters: struct kvm_interrupt (in) +Returns: 0 on success, negative on failure. + +Queues a hardware interrupt vector to be injected. + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +X86: + +Returns: 0 on success, + -EEXIST if an interrupt is already enqueued + -EINVAL the the irq number is invalid + -ENXIO if the PIC is in the kernel + -EFAULT if the pointer is invalid + +Note 'irq' is an interrupt vector, not an interrupt pin or line. This +ioctl is useful if the in-kernel PIC is not used. + +PPC: + +Queues an external interrupt to be injected. This ioctl is overleaded +with 3 different irq values: + +a) KVM_INTERRUPT_SET + + This injects an edge type external interrupt into the guest once it's ready + to receive interrupts. When injected, the interrupt is done. + +b) KVM_INTERRUPT_UNSET + + This unsets any pending interrupt. + + Only available with KVM_CAP_PPC_UNSET_IRQ. + +c) KVM_INTERRUPT_SET_LEVEL + + This injects a level type external interrupt into the guest context. The + interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET + is triggered. + + Only available with KVM_CAP_PPC_IRQ_LEVEL. + +Note that any value for 'irq' other than the ones stated above is invalid +and incurs unexpected behavior. + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + +MIPS: + +Queues an external interrupt to be injected into the virtual CPU. A negative +interrupt number dequeues the interrupt. + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + + +4.17 KVM_DEBUG_GUEST + +Capability: basic +Architectures: none +Type: vcpu ioctl +Parameters: none) +Returns: -1 on error + +Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. + + +4.18 KVM_GET_MSRS + +Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system) +Architectures: x86 +Type: system ioctl, vcpu ioctl +Parameters: struct kvm_msrs (in/out) +Returns: number of msrs successfully returned; + -1 on error + +When used as a system ioctl: +Reads the values of MSR-based features that are available for the VM. This +is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values. +The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST +in a system ioctl. + +When used as a vcpu ioctl: +Reads model-specific registers from the vcpu. Supported msr indices can +be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl. + +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array) and the 'index' member of each array entry. +kvm will fill in the 'data' member. + + +4.19 KVM_SET_MSRS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_msrs (in) +Returns: 0 on success, -1 on error + +Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the +data structures. + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array), and the 'index' and 'data' members of each +array entry. + + +4.20 KVM_SET_CPUID + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid (in) +Returns: 0 on success, -1 on error + +Defines the vcpu responses to the cpuid instruction. Applications +should use the KVM_SET_CPUID2 ioctl if available. + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + + +4.21 KVM_SET_SIGNAL_MASK + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_signal_mask (in) +Returns: 0 on success, -1 on error + +Defines which signals are blocked during execution of KVM_RUN. This +signal mask temporarily overrides the threads signal mask. Any +unblocked signal received (except SIGKILL and SIGSTOP, which retain +their traditional behaviour) will cause KVM_RUN to return with -EINTR. + +Note the signal will only be delivered if not blocked by the original +signal mask. + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + + +4.22 KVM_GET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (out) +Returns: 0 on success, -1 on error + +Reads the floating point state from the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + + +4.23 KVM_SET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (in) +Returns: 0 on success, -1 on error + +Writes the floating point state to the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + + +4.24 KVM_CREATE_IRQCHIP + +Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) +Architectures: x86, ARM, arm64, s390 +Type: vm ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Creates an interrupt controller model in the kernel. +On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up +future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both +PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. +On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of +KVM_CREATE_DEVICE, which also supports creating a GICv2. Using +KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2. +On s390, a dummy irq routing table is created. + +Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled +before KVM_CREATE_IRQCHIP can be used. + + +4.25 KVM_IRQ_LINE + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, arm, arm64 +Type: vm ioctl +Parameters: struct kvm_irq_level +Returns: 0 on success, -1 on error + +Sets the level of a GSI input to the interrupt controller model in the kernel. +On some architectures it is required that an interrupt controller model has +been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered +interrupts require the level to be set to 1 and then back to 0. + +On real hardware, interrupt pins can be active-low or active-high. This +does not matter for the level field of struct kvm_irq_level: 1 always +means active (asserted), 0 means inactive (deasserted). + +x86 allows the operating system to program the interrupt polarity +(active-low/active-high) for level-triggered interrupts, and KVM used +to consider the polarity. However, due to bitrot in the handling of +active-low interrupts, the above convention is now valid on x86 too. +This is signaled by KVM_CAP_X86_IOAPIC_POLARITY_IGNORED. Userspace +should not present interrupts to the guest as active-low unless this +capability is present (or unless it is not using the in-kernel irqchip, +of course). + + +ARM/arm64 can signal an interrupt either at the CPU level, or at the +in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to +use PPIs designated for specific cpus. The irq field is interpreted +like this: + +  bits: | 31 ... 24 | 23 ... 16 | 15 ... 0 | + field: | irq_type | vcpu_index | irq_id | + +The irq_type field has the following values: +- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ +- irq_type[1]: in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.) + (the vcpu_index field is ignored) +- irq_type[2]: in-kernel GIC: PPI, irq_id between 16 and 31 (incl.) + +(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) + +In both cases, level is used to assert/deassert the line. + +struct kvm_irq_level { + union { + __u32 irq; /* GSI */ + __s32 status; /* not used for KVM_IRQ_LEVEL */ + }; + __u32 level; /* 0 or 1 */ +}; + + +4.26 KVM_GET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_irqchip (in/out) +Returns: 0 on success, -1 on error + +Reads the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP into a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + + +4.27 KVM_SET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_irqchip (in) +Returns: 0 on success, -1 on error + +Sets the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP from a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + + +4.28 KVM_XEN_HVM_CONFIG + +Capability: KVM_CAP_XEN_HVM +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_xen_hvm_config (in) +Returns: 0 on success, -1 on error + +Sets the MSR that the Xen HVM guest uses to initialize its hypercall +page, and provides the starting address and size of the hypercall +blobs in userspace. When the guest writes the MSR, kvm copies one +page of a blob (32- or 64-bit, depending on the vcpu mode) to guest +memory. + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + + +4.29 KVM_GET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (out) +Returns: 0 on success, -1 on error + +Gets the current timestamp of kvmclock as seen by the current guest. In +conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios +such as migration. + +When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the +set of bits that KVM can return in struct kvm_clock_data's flag member. + +The only flag defined now is KVM_CLOCK_TSC_STABLE. If set, the returned +value is the exact kvmclock value seen by all VCPUs at the instant +when KVM_GET_CLOCK was called. If clear, the returned value is simply +CLOCK_MONOTONIC plus a constant offset; the offset can be modified +with KVM_SET_CLOCK. KVM will try to make all VCPUs follow this clock, +but the exact value read by each VCPU could differ, because the host +TSC is not stable. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + + +4.30 KVM_SET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (in) +Returns: 0 on success, -1 on error + +Sets the current timestamp of kvmclock to the value specified in its parameter. +In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios +such as migration. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + + +4.31 KVM_GET_VCPU_EVENTS + +Capability: KVM_CAP_VCPU_EVENTS +Extended by: KVM_CAP_INTR_SHADOW +Architectures: x86, arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_vcpu_event (out) +Returns: 0 on success, -1 on error + +X86: + +Gets currently pending exceptions, interrupts, and NMIs as well as related +states of the vcpu. + +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pending; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; +}; + +The following bits are defined in the flags field: + +- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that + interrupt.shadow contains a valid state. + +- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a + valid state. + +- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the + exception_has_payload, exception_payload, and exception.pending + fields contain a valid state. This bit will be set whenever + KVM_CAP_EXCEPTION_PAYLOAD is enabled. + +ARM/ARM64: + +If the guest accesses a device that is being emulated by the host kernel in +such a way that a real device would generate a physical SError, KVM may make +a virtual SError pending for that VCPU. This system error interrupt remains +pending until the guest takes the exception by unmasking PSTATE.A. + +Running the VCPU may cause it to take a pending SError, or make an access that +causes an SError to become pending. The event's description is only valid while +the VPCU is not running. + +This API provides a way to read and write the pending 'event' state that is not +visible to the guest. To save, restore or migrate a VCPU the struct representing +the state can be read then written using this GET/SET API, along with the other +guest-visible registers. It is not possible to 'cancel' an SError that has been +made pending. + +A device being emulated in user-space may also wish to generate an SError. To do +this the events structure can be populated by user-space. The current state +should be read first, to ensure no existing SError is pending. If an existing +SError is pending, the architecture's 'Multiple SError interrupts' rules should +be followed. (2.5.3 of DDI0587.a "ARM Reliability, Availability, and +Serviceability (RAS) Specification"). + +SError exceptions always have an ESR value. Some CPUs have the ability to +specify what the virtual SError's ESR value should be. These systems will +advertise KVM_CAP_ARM_INJECT_SERROR_ESR. In this case exception.has_esr will +always have a non-zero value when read, and the agent making an SError pending +should specify the ISS field in the lower 24 bits of exception.serror_esr. If +the system supports KVM_CAP_ARM_INJECT_SERROR_ESR, but user-space sets the events +with exception.has_esr as zero, KVM will choose an ESR. + +Specifying exception.has_esr on a system that does not support it will return +-EINVAL. Setting anything other than the lower 24bits of exception.serror_esr +will return -EINVAL. + +struct kvm_vcpu_events { + struct { + __u8 serror_pending; + __u8 serror_has_esr; + /* Align it to 8 bytes */ + __u8 pad[6]; + __u64 serror_esr; + } exception; + __u32 reserved[12]; +}; + +4.32 KVM_SET_VCPU_EVENTS + +Capability: KVM_CAP_VCPU_EVENTS +Extended by: KVM_CAP_INTR_SHADOW +Architectures: x86, arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_vcpu_event (in) +Returns: 0 on success, -1 on error + +X86: + +Set pending exceptions, interrupts, and NMIs as well as related states of the +vcpu. + +See KVM_GET_VCPU_EVENTS for the data structure. + +Fields that may be modified asynchronously by running VCPUs can be excluded +from the update. These fields are nmi.pending, sipi_vector, smi.smm, +smi.pending. Keep the corresponding bits in the flags field cleared to +suppress overwriting the current in-kernel state. The bits are: + +KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel +KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector +KVM_VCPUEVENT_VALID_SMM - transfer the smi sub-struct. + +If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in +the flags field to signal that interrupt.shadow contains a valid state and +shall be written into the VCPU. + +KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. + +If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD +can be set in the flags field to signal that the +exception_has_payload, exception_payload, and exception.pending fields +contain a valid state and shall be written into the VCPU. + +ARM/ARM64: + +Set the pending SError exception state for this VCPU. It is not possible to +'cancel' an Serror that has been made pending. + +See KVM_GET_VCPU_EVENTS for the data structure. + + +4.33 KVM_GET_DEBUGREGS + +Capability: KVM_CAP_DEBUGREGS +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_debugregs (out) +Returns: 0 on success, -1 on error + +Reads debug registers from the vcpu. + +struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; +}; + + +4.34 KVM_SET_DEBUGREGS + +Capability: KVM_CAP_DEBUGREGS +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_debugregs (in) +Returns: 0 on success, -1 on error + +Writes debug registers into the vcpu. + +See KVM_GET_DEBUGREGS for the data structure. The flags field is unused +yet and must be cleared on entry. + + +4.35 KVM_SET_USER_MEMORY_REGION + +Capability: KVM_CAP_USER_MEMORY +Architectures: all +Type: vm ioctl +Parameters: struct kvm_userspace_memory_region (in) +Returns: 0 on success, -1 on error + +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) +#define KVM_MEM_READONLY (1UL << 1) + +This ioctl allows the user to create, modify or delete a guest physical +memory slot. Bits 0-15 of "slot" specify the slot id and this value +should be less than the maximum number of user memory slots supported per +VM. The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS. +Slots may not overlap in guest physical address space. + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" +specifies the address space which is being modified. They must be +less than the value that KVM_CHECK_EXTENSION returns for the +KVM_CAP_MULTI_ADDRESS_SPACE capability. Slots in separate address spaces +are unrelated; the restriction on overlapping slots only applies within +each address space. + +Deleting a slot is done by passing zero for memory_size. When changing +an existing slot, it may be moved in the guest physical memory space, +or its flags may be modified, but it may not be resized. + +Memory for the region is taken starting at the address denoted by the +field userspace_addr, which must point at user addressable memory for +the entire memory slot size. Any object may back this memory, including +anonymous memory, ordinary files, and hugetlbfs. + +It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr +be identical. This allows large pages in the guest to be backed by large +pages in the host. + +The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and +KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of +writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to +use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, +to make a new slot read-only. In this case, writes to this memory will be +posted to userspace as KVM_EXIT_MMIO exits. + +When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of +the memory region are automatically reflected into the guest. For example, an +mmap() that affects the region will be made visible immediately. Another +example is madvise(MADV_DROP). + +It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. +The KVM_SET_MEMORY_REGION does not allow fine grained control over memory +allocation and is deprecated. + + +4.36 KVM_SET_TSS_ADDR + +Capability: KVM_CAP_SET_TSS_ADDR +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long tss_address (in) +Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a three-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + + +4.37 KVM_ENABLE_CAP + +Capability: KVM_CAP_ENABLE_CAP +Architectures: mips, ppc, s390 +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + +Capability: KVM_CAP_ENABLE_CAP_VM +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + ++Not all extensions are enabled by default. Using this ioctl the application +can enable an extension, making it available to the guest. + +On systems that do not support this ioctl, it always fails. On systems that +do support it, it only works for extensions that are supported for enablement. + +To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should +be used. + +struct kvm_enable_cap { + /* in */ + __u32 cap; + +The capability that is supposed to get enabled. + + __u32 flags; + +A bitfield indicating future enhancements. Has to be 0 for now. + + __u64 args[4]; + +Arguments for enabling a feature. If a feature needs initial values to +function properly, this is the place to put them. + + __u8 pad[64]; +}; + +The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl +for vm-wide capabilities. + +4.38 KVM_GET_MP_STATE + +Capability: KVM_CAP_MP_STATE +Architectures: x86, s390, arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_mp_state (out) +Returns: 0 on success; -1 on error + +struct kvm_mp_state { + __u32 mp_state; +}; + +Returns the vcpu's current "multiprocessing state" (though also valid on +uniprocessor guests). + +Possible values are: + + - KVM_MP_STATE_RUNNABLE: the vcpu is currently running [x86,arm/arm64] + - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) + which has not yet received an INIT signal [x86] + - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is + now ready for a SIPI [x86] + - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and + is waiting for an interrupt [x86] + - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector + accessible via KVM_GET_VCPU_EVENTS) [x86] + - KVM_MP_STATE_STOPPED: the vcpu is stopped [s390,arm/arm64] + - KVM_MP_STATE_CHECK_STOP: the vcpu is in a special error state [s390] + - KVM_MP_STATE_OPERATING: the vcpu is operating (running or halted) + [s390] + - KVM_MP_STATE_LOAD: the vcpu is in a special load/startup state + [s390] + +On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an +in-kernel irqchip, the multiprocessing state must be maintained by userspace on +these architectures. + +For arm/arm64: + +The only states that are valid are KVM_MP_STATE_STOPPED and +KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. + +4.39 KVM_SET_MP_STATE + +Capability: KVM_CAP_MP_STATE +Architectures: x86, s390, arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_mp_state (in) +Returns: 0 on success; -1 on error + +Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for +arguments. + +On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an +in-kernel irqchip, the multiprocessing state must be maintained by userspace on +these architectures. + +For arm/arm64: + +The only states that are valid are KVM_MP_STATE_STOPPED and +KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not. + +4.40 KVM_SET_IDENTITY_MAP_ADDR + +Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long identity (in) +Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a one-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +Setting the address to 0 will result in resetting the address to its default +(0xfffbc000). + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + +Fails if any VCPU has already been created. + +4.41 KVM_SET_BOOT_CPU_ID + +Capability: KVM_CAP_SET_BOOT_CPU_ID +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long vcpu_id +Returns: 0 on success, -1 on error + +Define which vcpu is the Bootstrap Processor (BSP). Values are the same +as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default +is vcpu 0. + + +4.42 KVM_GET_XSAVE + +Capability: KVM_CAP_XSAVE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xsave (out) +Returns: 0 on success, -1 on error + +struct kvm_xsave { + __u32 region[1024]; +}; + +This ioctl would copy current vcpu's xsave struct to the userspace. + + +4.43 KVM_SET_XSAVE + +Capability: KVM_CAP_XSAVE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xsave (in) +Returns: 0 on success, -1 on error + +struct kvm_xsave { + __u32 region[1024]; +}; + +This ioctl would copy userspace's xsave struct to the kernel. + + +4.44 KVM_GET_XCRS + +Capability: KVM_CAP_XCRS +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xcrs (out) +Returns: 0 on success, -1 on error + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +This ioctl would copy current vcpu's xcrs to the userspace. + + +4.45 KVM_SET_XCRS + +Capability: KVM_CAP_XCRS +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xcrs (in) +Returns: 0 on success, -1 on error + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +This ioctl would set vcpu's xcr to the value userspace specified. + + +4.46 KVM_GET_SUPPORTED_CPUID + +Capability: KVM_CAP_EXT_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are supported by both the +hardware and kvm in its default configuration. Userspace can use the +information returned by this ioctl to construct cpuid information (for +KVM_SET_CPUID2) that is consistent with hardware, kernel, and +userspace capabilities, and with user requirements (for example, the +user may wish to constrain cpuid to emulate older hardware, or for +feature consistency across a cluster). + +Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may +expose cpuid features (e.g. MONITOR) which are not supported by kvm in +its default configuration. If userspace enables such capabilities, it +is responsible for modifying the results of this ioctl appropriately. + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe the cpu +capabilities, an error (E2BIG) is returned. If the number is too high, +the 'nent' field is adjusted and an error (ENOMEM) is returned. If the +number is just right, the 'nent' field is adjusted to the number of valid +entries in the 'entries' array, which is then filled. + +The entries returned are the host cpuid as returned by the cpuid instruction, +with unknown or unsupported features masked out. Some features (for example, +x2apic), may not be present in the host cpu, but are exposed by kvm if it can +emulate them efficiently. The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + +The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned +as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC +support. Instead it is reported via + + ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) + +if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the +feature in userspace, then you can enable the feature for KVM_SET_CPUID2. + + +4.47 KVM_PPC_GET_PVINFO + +Capability: KVM_CAP_PPC_GET_PVINFO +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_pvinfo (out) +Returns: 0 on success, !0 on error + +struct kvm_ppc_pvinfo { + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +This ioctl fetches PV specific information that need to be passed to the guest +using the device tree or other means from vm context. + +The hcall array defines 4 instructions that make up a hypercall. + +If any additional field gets added to this structure later on, a bit for that +additional piece of information will be set in the flags bitmap. + +The flags bitmap is defined as: + + /* the host supports the ePAPR idle hcall + #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + +4.52 KVM_SET_GSI_ROUTING + +Capability: KVM_CAP_IRQ_ROUTING +Architectures: x86 s390 arm arm64 +Type: vm ioctl +Parameters: struct kvm_irq_routing (in) +Returns: 0 on success, -1 on error + +Sets the GSI routing table entries, overwriting any previously set entries. + +On arm/arm64, GSI routing has the following limitation: +- GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD. + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + +No flags are specified so far, the corresponding field must be set to zero. + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + struct kvm_irq_routing_s390_adapter adapter; + struct kvm_irq_routing_hv_sint hv_sint; + __u32 pad[8]; + } u; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 +#define KVM_IRQ_ROUTING_S390_ADAPTER 3 +#define KVM_IRQ_ROUTING_HV_SINT 4 + +flags: +- KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI routing entry + type, specifies that the devid field contains a valid value. The per-VM + KVM_CAP_MSI_DEVID capability advertises the requirement to provide + the device ID. If this capability is not available, userspace should + never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. +- zero otherwise + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + union { + __u32 pad; + __u32 devid; + }; +}; + +If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier +for the device that wrote the MSI message. For PCI, this is usually a +BFD identifier in the lower 16 bits. + +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + +struct kvm_irq_routing_s390_adapter { + __u64 ind_addr; + __u64 summary_addr; + __u64 ind_offset; + __u32 summary_offset; + __u32 adapter_id; +}; + +struct kvm_irq_routing_hv_sint { + __u32 vcpu; + __u32 sint; +}; + + +4.55 KVM_SET_TSC_KHZ + +Capability: KVM_CAP_TSC_CONTROL +Architectures: x86 +Type: vcpu ioctl +Parameters: virtual tsc_khz +Returns: 0 on success, -1 on error + +Specifies the tsc frequency for the virtual machine. The unit of the +frequency is KHz. + + +4.56 KVM_GET_TSC_KHZ + +Capability: KVM_CAP_GET_TSC_KHZ +Architectures: x86 +Type: vcpu ioctl +Parameters: none +Returns: virtual tsc-khz on success, negative value on error + +Returns the tsc frequency of the guest. The unit of the return value is +KHz. If the host has unstable tsc this ioctl returns -EIO instead as an +error. + + +4.57 KVM_GET_LAPIC + +Capability: KVM_CAP_IRQCHIP +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_lapic_state (out) +Returns: 0 on success, -1 on error + +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +Reads the Local APIC registers and copies them into the input argument. The +data format and layout are the same as documented in the architecture manual. + +If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is +enabled, then the format of APIC_ID register depends on the APIC mode +(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in +the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID +which is stored in bits 31-24 of the APIC register, or equivalently in +byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then +be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. + +If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state +always uses xAPIC format. + + +4.58 KVM_SET_LAPIC + +Capability: KVM_CAP_IRQCHIP +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_lapic_state (in) +Returns: 0 on success, -1 on error + +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +Copies the input argument into the Local APIC registers. The data format +and layout are the same as documented in the architecture manual. + +The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's +regs field) depends on the state of the KVM_CAP_X2APIC_API capability. +See the note in KVM_GET_LAPIC. + + +4.59 KVM_IOEVENTFD + +Capability: KVM_CAP_IOEVENTFD +Architectures: all +Type: vm ioctl +Parameters: struct kvm_ioeventfd (in) +Returns: 0 on success, !0 on error + +This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address +within the guest. A guest write in the registered address will signal the +provided event instead of triggering an exit. + +struct kvm_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 0, 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 pad[36]; +}; + +For the special case of virtio-ccw devices on s390, the ioevent is matched +to a subchannel/virtqueue tuple instead. + +The following flags are defined: + +#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) +#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) +#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) +#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ + (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) + +If datamatch flag is set, the event will be signaled only if the written value +to the registered address is equal to datamatch in struct kvm_ioeventfd. + +For virtio-ccw devices, addr contains the subchannel id and datamatch the +virtqueue index. + +With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and +the kernel will ignore the length of guest write and may get a faster vmexit. +The speedup may only apply to specific architectures, but the ioeventfd will +work anyway. + +4.60 KVM_DIRTY_TLB + +Capability: KVM_CAP_SW_TLB +Architectures: ppc +Type: vcpu ioctl +Parameters: struct kvm_dirty_tlb (in) +Returns: 0 on success, -1 on error + +struct kvm_dirty_tlb { + __u64 bitmap; + __u32 num_dirty; +}; + +This must be called whenever userspace has changed an entry in the shared +TLB, prior to calling KVM_RUN on the associated vcpu. + +The "bitmap" field is the userspace address of an array. This array +consists of a number of bits, equal to the total number of TLB entries as +determined by the last successful call to KVM_CONFIG_TLB, rounded up to the +nearest multiple of 64. + +Each bit corresponds to one TLB entry, ordered the same as in the shared TLB +array. + +The array is little-endian: the bit 0 is the least significant bit of the +first byte, bit 8 is the least significant bit of the second byte, etc. +This avoids any complications with differing word sizes. + +The "num_dirty" field is a performance hint for KVM to determine whether it +should skip processing the bitmap and just invalidate everything. It must +be set to the number of set bits in the bitmap. + + +4.62 KVM_CREATE_SPAPR_TCE + +Capability: KVM_CAP_SPAPR_TCE +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce (in) +Returns: file descriptor for manipulating the created TCE table + +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. + +/* for KVM_CAP_SPAPR_TCE */ +struct kvm_create_spapr_tce { + __u64 liobn; + __u32 window_size; +}; + +The liobn field gives the logical IO bus number for which to create a +TCE table. The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every 4kiB of the DMA window. + +When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE +table has been created using this ioctl(), the kernel will handle it +in real mode, updating the TCE table. H_PUT_TCE calls for other +liobns will cause a vm exit and must be handled by userspace. + +The return value is a file descriptor which can be passed to mmap(2) +to map the created TCE table into userspace. This lets userspace read +the entries written by kernel-handled H_PUT_TCE calls, and also lets +userspace update the TCE table directly which is useful in some +circumstances. + + +4.63 KVM_ALLOCATE_RMA + +Capability: KVM_CAP_PPC_RMA +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_allocate_rma (out) +Returns: file descriptor for mapping the allocated RMA + +This allocates a Real Mode Area (RMA) from the pool allocated at boot +time by the kernel. An RMA is a physically-contiguous, aligned region +of memory used on older POWER processors to provide the memory which +will be accessed by real-mode (MMU off) accesses in a KVM guest. +POWER processors support a set of sizes for the RMA that usually +includes 64MB, 128MB, 256MB and some larger powers of two. + +/* for KVM_ALLOCATE_RMA */ +struct kvm_allocate_rma { + __u64 rma_size; +}; + +The return value is a file descriptor which can be passed to mmap(2) +to map the allocated RMA into userspace. The mapped area can then be +passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the +RMA for a virtual machine. The size of the RMA in bytes (which is +fixed at host kernel boot time) is returned in the rma_size field of +the argument structure. + +The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl +is supported; 2 if the processor requires all virtual machines to have +an RMA, or 1 if the processor can use an RMA but doesn't require it, +because it supports the Virtual RMA (VRMA) facility. + + +4.64 KVM_NMI + +Capability: KVM_CAP_USER_NMI +Architectures: x86 +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Queues an NMI on the thread's vcpu. Note this is well defined only +when KVM_CREATE_IRQCHIP has not been called, since this is an interface +between the virtual cpu core and virtual local APIC. After KVM_CREATE_IRQCHIP +has been called, this interface is completely emulated within the kernel. + +To use this to emulate the LINT1 input with KVM_CREATE_IRQCHIP, use the +following algorithm: + + - pause the vcpu + - read the local APIC's state (KVM_GET_LAPIC) + - check whether changing LINT1 will queue an NMI (see the LVT entry for LINT1) + - if so, issue KVM_NMI + - resume the vcpu + +Some guests configure the LINT1 NMI input to cause a panic, aiding in +debugging. + + +4.65 KVM_S390_UCAS_MAP + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_ucas_mapping (in) +Returns: 0 in case of success + +The parameter is defined like this: + struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; + }; + +This ioctl maps the memory at "user_addr" with the length "length" to +the vcpu's address space starting at "vcpu_addr". All parameters need to +be aligned by 1 megabyte. + + +4.66 KVM_S390_UCAS_UNMAP + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_ucas_mapping (in) +Returns: 0 in case of success + +The parameter is defined like this: + struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; + }; + +This ioctl unmaps the memory in the vcpu's address space starting at +"vcpu_addr" with the length "length". The field "user_addr" is ignored. +All parameters need to be aligned by 1 megabyte. + + +4.67 KVM_S390_VCPU_FAULT + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: vcpu absolute address (in) +Returns: 0 in case of success + +This call creates a page table entry on the virtual cpu's address space +(for user controlled virtual machines) or the virtual machine's address +space (for regular virtual machines). This only works for minor faults, +thus it's recommended to access subject memory page via the user page +table upfront. This is useful to handle validity intercepts for user +controlled virtual machines to fault in the virtual cpu's lowcore pages +prior to calling the KVM_RUN ioctl. + + +4.68 KVM_SET_ONE_REG + +Capability: KVM_CAP_ONE_REG +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_one_reg (in) +Returns: 0 on success, negative value on failure +Errors: +  ENOENT:   no such register +  EINVAL:   invalid register ID, or no such register +  EPERM:    (arm64) register access not allowed before vcpu finalization +(These error codes are indicative only: do not rely on a specific error +code being returned in a specific situation.) + +struct kvm_one_reg { + __u64 id; + __u64 addr; +}; + +Using this ioctl, a single vcpu register can be set to a specific value +defined by user space with the passed in struct kvm_one_reg, where id +refers to the register identifier as described below and addr is a pointer +to a variable with the respective size. There can be architecture agnostic +and architecture specific registers. Each have their own range of operation +and their own constants and width. To keep track of the implemented +registers, find a list below: + + Arch | Register | Width (bits) + | | + PPC | KVM_REG_PPC_HIOR | 64 + PPC | KVM_REG_PPC_IAC1 | 64 + PPC | KVM_REG_PPC_IAC2 | 64 + PPC | KVM_REG_PPC_IAC3 | 64 + PPC | KVM_REG_PPC_IAC4 | 64 + PPC | KVM_REG_PPC_DAC1 | 64 + PPC | KVM_REG_PPC_DAC2 | 64 + PPC | KVM_REG_PPC_DABR | 64 + PPC | KVM_REG_PPC_DSCR | 64 + PPC | KVM_REG_PPC_PURR | 64 + PPC | KVM_REG_PPC_SPURR | 64 + PPC | KVM_REG_PPC_DAR | 64 + PPC | KVM_REG_PPC_DSISR | 32 + PPC | KVM_REG_PPC_AMR | 64 + PPC | KVM_REG_PPC_UAMOR | 64 + PPC | KVM_REG_PPC_MMCR0 | 64 + PPC | KVM_REG_PPC_MMCR1 | 64 + PPC | KVM_REG_PPC_MMCRA | 64 + PPC | KVM_REG_PPC_MMCR2 | 64 + PPC | KVM_REG_PPC_MMCRS | 64 + PPC | KVM_REG_PPC_SIAR | 64 + PPC | KVM_REG_PPC_SDAR | 64 + PPC | KVM_REG_PPC_SIER | 64 + PPC | KVM_REG_PPC_PMC1 | 32 + PPC | KVM_REG_PPC_PMC2 | 32 + PPC | KVM_REG_PPC_PMC3 | 32 + PPC | KVM_REG_PPC_PMC4 | 32 + PPC | KVM_REG_PPC_PMC5 | 32 + PPC | KVM_REG_PPC_PMC6 | 32 + PPC | KVM_REG_PPC_PMC7 | 32 + PPC | KVM_REG_PPC_PMC8 | 32 + PPC | KVM_REG_PPC_FPR0 | 64 + ... + PPC | KVM_REG_PPC_FPR31 | 64 + PPC | KVM_REG_PPC_VR0 | 128 + ... + PPC | KVM_REG_PPC_VR31 | 128 + PPC | KVM_REG_PPC_VSR0 | 128 + ... + PPC | KVM_REG_PPC_VSR31 | 128 + PPC | KVM_REG_PPC_FPSCR | 64 + PPC | KVM_REG_PPC_VSCR | 32 + PPC | KVM_REG_PPC_VPA_ADDR | 64 + PPC | KVM_REG_PPC_VPA_SLB | 128 + PPC | KVM_REG_PPC_VPA_DTL | 128 + PPC | KVM_REG_PPC_EPCR | 32 + PPC | KVM_REG_PPC_EPR | 32 + PPC | KVM_REG_PPC_TCR | 32 + PPC | KVM_REG_PPC_TSR | 32 + PPC | KVM_REG_PPC_OR_TSR | 32 + PPC | KVM_REG_PPC_CLEAR_TSR | 32 + PPC | KVM_REG_PPC_MAS0 | 32 + PPC | KVM_REG_PPC_MAS1 | 32 + PPC | KVM_REG_PPC_MAS2 | 64 + PPC | KVM_REG_PPC_MAS7_3 | 64 + PPC | KVM_REG_PPC_MAS4 | 32 + PPC | KVM_REG_PPC_MAS6 | 32 + PPC | KVM_REG_PPC_MMUCFG | 32 + PPC | KVM_REG_PPC_TLB0CFG | 32 + PPC | KVM_REG_PPC_TLB1CFG | 32 + PPC | KVM_REG_PPC_TLB2CFG | 32 + PPC | KVM_REG_PPC_TLB3CFG | 32 + PPC | KVM_REG_PPC_TLB0PS | 32 + PPC | KVM_REG_PPC_TLB1PS | 32 + PPC | KVM_REG_PPC_TLB2PS | 32 + PPC | KVM_REG_PPC_TLB3PS | 32 + PPC | KVM_REG_PPC_EPTCFG | 32 + PPC | KVM_REG_PPC_ICP_STATE | 64 + PPC | KVM_REG_PPC_VP_STATE | 128 + PPC | KVM_REG_PPC_TB_OFFSET | 64 + PPC | KVM_REG_PPC_SPMC1 | 32 + PPC | KVM_REG_PPC_SPMC2 | 32 + PPC | KVM_REG_PPC_IAMR | 64 + PPC | KVM_REG_PPC_TFHAR | 64 + PPC | KVM_REG_PPC_TFIAR | 64 + PPC | KVM_REG_PPC_TEXASR | 64 + PPC | KVM_REG_PPC_FSCR | 64 + PPC | KVM_REG_PPC_PSPB | 32 + PPC | KVM_REG_PPC_EBBHR | 64 + PPC | KVM_REG_PPC_EBBRR | 64 + PPC | KVM_REG_PPC_BESCR | 64 + PPC | KVM_REG_PPC_TAR | 64 + PPC | KVM_REG_PPC_DPDES | 64 + PPC | KVM_REG_PPC_DAWR | 64 + PPC | KVM_REG_PPC_DAWRX | 64 + PPC | KVM_REG_PPC_CIABR | 64 + PPC | KVM_REG_PPC_IC | 64 + PPC | KVM_REG_PPC_VTB | 64 + PPC | KVM_REG_PPC_CSIGR | 64 + PPC | KVM_REG_PPC_TACR | 64 + PPC | KVM_REG_PPC_TCSCR | 64 + PPC | KVM_REG_PPC_PID | 64 + PPC | KVM_REG_PPC_ACOP | 64 + PPC | KVM_REG_PPC_VRSAVE | 32 + PPC | KVM_REG_PPC_LPCR | 32 + PPC | KVM_REG_PPC_LPCR_64 | 64 + PPC | KVM_REG_PPC_PPR | 64 + PPC | KVM_REG_PPC_ARCH_COMPAT | 32 + PPC | KVM_REG_PPC_DABRX | 32 + PPC | KVM_REG_PPC_WORT | 64 + PPC | KVM_REG_PPC_SPRG9 | 64 + PPC | KVM_REG_PPC_DBSR | 32 + PPC | KVM_REG_PPC_TIDR | 64 + PPC | KVM_REG_PPC_PSSCR | 64 + PPC | KVM_REG_PPC_DEC_EXPIRY | 64 + PPC | KVM_REG_PPC_PTCR | 64 + PPC | KVM_REG_PPC_TM_GPR0 | 64 + ... + PPC | KVM_REG_PPC_TM_GPR31 | 64 + PPC | KVM_REG_PPC_TM_VSR0 | 128 + ... + PPC | KVM_REG_PPC_TM_VSR63 | 128 + PPC | KVM_REG_PPC_TM_CR | 64 + PPC | KVM_REG_PPC_TM_LR | 64 + PPC | KVM_REG_PPC_TM_CTR | 64 + PPC | KVM_REG_PPC_TM_FPSCR | 64 + PPC | KVM_REG_PPC_TM_AMR | 64 + PPC | KVM_REG_PPC_TM_PPR | 64 + PPC | KVM_REG_PPC_TM_VRSAVE | 64 + PPC | KVM_REG_PPC_TM_VSCR | 32 + PPC | KVM_REG_PPC_TM_DSCR | 64 + PPC | KVM_REG_PPC_TM_TAR | 64 + PPC | KVM_REG_PPC_TM_XER | 64 + | | + MIPS | KVM_REG_MIPS_R0 | 64 + ... + MIPS | KVM_REG_MIPS_R31 | 64 + MIPS | KVM_REG_MIPS_HI | 64 + MIPS | KVM_REG_MIPS_LO | 64 + MIPS | KVM_REG_MIPS_PC | 64 + MIPS | KVM_REG_MIPS_CP0_INDEX | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 + MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 + MIPS | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32 + MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 + MIPS | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64 + MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 + MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 + MIPS | KVM_REG_MIPS_CP0_SEGCTL0 | 64 + MIPS | KVM_REG_MIPS_CP0_SEGCTL1 | 64 + MIPS | KVM_REG_MIPS_CP0_SEGCTL2 | 64 + MIPS | KVM_REG_MIPS_CP0_PWBASE | 64 + MIPS | KVM_REG_MIPS_CP0_PWFIELD | 64 + MIPS | KVM_REG_MIPS_CP0_PWSIZE | 64 + MIPS | KVM_REG_MIPS_CP0_WIRED | 32 + MIPS | KVM_REG_MIPS_CP0_PWCTL | 32 + MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 + MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 + MIPS | KVM_REG_MIPS_CP0_BADINSTR | 32 + MIPS | KVM_REG_MIPS_CP0_BADINSTRP | 32 + MIPS | KVM_REG_MIPS_CP0_COUNT | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 + MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 + MIPS | KVM_REG_MIPS_CP0_STATUS | 32 + MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 + MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 + MIPS | KVM_REG_MIPS_CP0_EPC | 64 + MIPS | KVM_REG_MIPS_CP0_PRID | 32 + MIPS | KVM_REG_MIPS_CP0_EBASE | 64 + MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG3 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG4 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 + MIPS | KVM_REG_MIPS_CP0_XCONTEXT | 64 + MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 + MIPS | KVM_REG_MIPS_CP0_MAAR(0..63) | 64 + MIPS | KVM_REG_MIPS_COUNT_CTL | 64 + MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 + MIPS | KVM_REG_MIPS_COUNT_HZ | 64 + MIPS | KVM_REG_MIPS_FPR_32(0..31) | 32 + MIPS | KVM_REG_MIPS_FPR_64(0..31) | 64 + MIPS | KVM_REG_MIPS_VEC_128(0..31) | 128 + MIPS | KVM_REG_MIPS_FCR_IR | 32 + MIPS | KVM_REG_MIPS_FCR_CSR | 32 + MIPS | KVM_REG_MIPS_MSA_IR | 32 + MIPS | KVM_REG_MIPS_MSA_CSR | 32 + +ARM registers are mapped using the lower 32 bits. The upper 16 of that +is the register group type, or coprocessor number: + +ARM core registers have the following id bit patterns: + 0x4020 0000 0010 + +ARM 32-bit CP15 registers have the following id bit patterns: + 0x4020 0000 000F + +ARM 64-bit CP15 registers have the following id bit patterns: + 0x4030 0000 000F + +ARM CCSIDR registers are demultiplexed by CSSELR value: + 0x4020 0000 0011 00 + +ARM 32-bit VFP control registers have the following id bit patterns: + 0x4020 0000 0012 1 + +ARM 64-bit FP registers have the following id bit patterns: + 0x4030 0000 0012 0 + +ARM firmware pseudo-registers have the following bit pattern: + 0x4030 0000 0014 + + +arm64 registers are mapped using the lower 32 bits. The upper 16 of +that is the register group type, or coprocessor number: + +arm64 core/FP-SIMD registers have the following id bit patterns. Note +that the size of the access is variable, as the kvm_regs structure +contains elements ranging from 32 to 128 bits. The index is a 32bit +value in the kvm_regs structure seen as a 32bit array. + 0x60x0 0000 0010 + +Specifically: + Encoding Register Bits kvm_regs member +---------------------------------------------------------------- + 0x6030 0000 0010 0000 X0 64 regs.regs[0] + 0x6030 0000 0010 0002 X1 64 regs.regs[1] + ... + 0x6030 0000 0010 003c X30 64 regs.regs[30] + 0x6030 0000 0010 003e SP 64 regs.sp + 0x6030 0000 0010 0040 PC 64 regs.pc + 0x6030 0000 0010 0042 PSTATE 64 regs.pstate + 0x6030 0000 0010 0044 SP_EL1 64 sp_el1 + 0x6030 0000 0010 0046 ELR_EL1 64 elr_el1 + 0x6030 0000 0010 0048 SPSR_EL1 64 spsr[KVM_SPSR_EL1] (alias SPSR_SVC) + 0x6030 0000 0010 004a SPSR_ABT 64 spsr[KVM_SPSR_ABT] + 0x6030 0000 0010 004c SPSR_UND 64 spsr[KVM_SPSR_UND] + 0x6030 0000 0010 004e SPSR_IRQ 64 spsr[KVM_SPSR_IRQ] + 0x6060 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] + 0x6040 0000 0010 0054 V0 128 fp_regs.vregs[0] (*) + 0x6040 0000 0010 0058 V1 128 fp_regs.vregs[1] (*) + ... + 0x6040 0000 0010 00d0 V31 128 fp_regs.vregs[31] (*) + 0x6020 0000 0010 00d4 FPSR 32 fp_regs.fpsr + 0x6020 0000 0010 00d5 FPCR 32 fp_regs.fpcr + +(*) These encodings are not accepted for SVE-enabled vcpus. See + KVM_ARM_VCPU_INIT. + + The equivalent register content can be accessed via bits [127:0] of + the corresponding SVE Zn registers instead for vcpus that have SVE + enabled (see below). + +arm64 CCSIDR registers are demultiplexed by CSSELR value: + 0x6020 0000 0011 00 + +arm64 system registers have the following id bit patterns: + 0x6030 0000 0013 + +arm64 firmware pseudo-registers have the following bit pattern: + 0x6030 0000 0014 + +arm64 SVE registers have the following bit patterns: + 0x6080 0000 0015 00 Zn bits[2048*slice + 2047 : 2048*slice] + 0x6050 0000 0015 04 Pn bits[256*slice + 255 : 256*slice] + 0x6050 0000 0015 060 FFR bits[256*slice + 255 : 256*slice] + 0x6060 0000 0015 ffff KVM_REG_ARM64_SVE_VLS pseudo-register + +Access to register IDs where 2048 * slice >= 128 * max_vq will fail with +ENOENT. max_vq is the vcpu's maximum supported vector length in 128-bit +quadwords: see (**) below. + +These registers are only accessible on vcpus for which SVE is enabled. +See KVM_ARM_VCPU_INIT for details. + +In addition, except for KVM_REG_ARM64_SVE_VLS, these registers are not +accessible until the vcpu's SVE configuration has been finalized +using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). See KVM_ARM_VCPU_INIT +and KVM_ARM_VCPU_FINALIZE for more information about this procedure. + +KVM_REG_ARM64_SVE_VLS is a pseudo-register that allows the set of vector +lengths supported by the vcpu to be discovered and configured by +userspace. When transferred to or from user memory via KVM_GET_ONE_REG +or KVM_SET_ONE_REG, the value of this register is of type +__u64[KVM_ARM64_SVE_VLS_WORDS], and encodes the set of vector lengths as +follows: + +__u64 vector_lengths[KVM_ARM64_SVE_VLS_WORDS]; + +if (vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX && + ((vector_lengths[(vq - KVM_ARM64_SVE_VQ_MIN) / 64] >> + ((vq - KVM_ARM64_SVE_VQ_MIN) % 64)) & 1)) + /* Vector length vq * 16 bytes supported */ +else + /* Vector length vq * 16 bytes not supported */ + +(**) The maximum value vq for which the above condition is true is +max_vq. This is the maximum vector length available to the guest on +this vcpu, and determines which register slices are visible through +this ioctl interface. + +(See Documentation/arm64/sve.rst for an explanation of the "vq" +nomenclature.) + +KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. +KVM_ARM_VCPU_INIT initialises it to the best set of vector lengths that +the host supports. + +Userspace may subsequently modify it if desired until the vcpu's SVE +configuration is finalized using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). + +Apart from simply removing all vector lengths from the host set that +exceed some value, support for arbitrarily chosen sets of vector lengths +is hardware-dependent and may not be available. Attempting to configure +an invalid set of vector lengths via KVM_SET_ONE_REG will fail with +EINVAL. + +After the vcpu's SVE configuration is finalized, further attempts to +write this register will fail with EPERM. + + +MIPS registers are mapped using the lower 32 bits. The upper 16 of that is +the register group type: + +MIPS core registers (see above) have the following id bit patterns: + 0x7030 0000 0000 + +MIPS CP0 registers (see KVM_REG_MIPS_CP0_* above) have the following id bit +patterns depending on whether they're 32-bit or 64-bit registers: + 0x7020 0000 0001 00 (32-bit) + 0x7030 0000 0001 00 (64-bit) + +Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 +versions of the EntryLo registers regardless of the word size of the host +hardware, host kernel, guest, and whether XPA is present in the guest, i.e. +with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and +the PFNX field starting at bit 30. + +MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit +patterns: + 0x7030 0000 0001 01 + +MIPS KVM control registers (see above) have the following id bit patterns: + 0x7030 0000 0002 + +MIPS FPU registers (see KVM_REG_MIPS_FPR_{32,64}() above) have the following +id bit patterns depending on the size of the register being accessed. They are +always accessed according to the current guest FPU mode (Status.FR and +Config5.FRE), i.e. as the guest would see them, and they become unpredictable +if the guest FPU mode is changed. MIPS SIMD Architecture (MSA) vector +registers (see KVM_REG_MIPS_VEC_128() above) have similar patterns as they +overlap the FPU registers: + 0x7020 0000 0003 00 <0:3> (32-bit FPU registers) + 0x7030 0000 0003 00 <0:3> (64-bit FPU registers) + 0x7040 0000 0003 00 <0:3> (128-bit MSA vector registers) + +MIPS FPU control registers (see KVM_REG_MIPS_FCR_{IR,CSR} above) have the +following id bit patterns: + 0x7020 0000 0003 01 <0:3> + +MIPS MSA control registers (see KVM_REG_MIPS_MSA_{IR,CSR} above) have the +following id bit patterns: + 0x7020 0000 0003 02 <0:3> + + +4.69 KVM_GET_ONE_REG + +Capability: KVM_CAP_ONE_REG +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_one_reg (in and out) +Returns: 0 on success, negative value on failure +Errors include: +  ENOENT:   no such register +  EINVAL:   invalid register ID, or no such register +  EPERM:    (arm64) register access not allowed before vcpu finalization +(These error codes are indicative only: do not rely on a specific error +code being returned in a specific situation.) + +This ioctl allows to receive the value of a single register implemented +in a vcpu. The register to read is indicated by the "id" field of the +kvm_one_reg struct passed in. On success, the register value can be found +at the memory location pointed to by "addr". + +The list of registers accessible using this interface is identical to the +list in 4.68. + + +4.70 KVM_KVMCLOCK_CTRL + +Capability: KVM_CAP_KVMCLOCK_CTRL +Architectures: Any that implement pvclocks (currently x86 only) +Type: vcpu ioctl +Parameters: None +Returns: 0 on success, -1 on error + +This signals to the host kernel that the specified guest is being paused by +userspace. The host will set a flag in the pvclock structure that is checked +from the soft lockup watchdog. The flag is part of the pvclock structure that +is shared between guest and host, specifically the second bit of the flags +field of the pvclock_vcpu_time_info structure. It will be set exclusively by +the host and read/cleared exclusively by the guest. The guest operation of +checking and clearing the flag must an atomic operation so +load-link/store-conditional, or equivalent must be used. There are two cases +where the guest will clear the flag: when the soft lockup watchdog timer resets +itself or when a soft lockup is detected. This ioctl can be called any time +after pausing the vcpu, but before it is resumed. + + +4.71 KVM_SIGNAL_MSI + +Capability: KVM_CAP_SIGNAL_MSI +Architectures: x86 arm arm64 +Type: vm ioctl +Parameters: struct kvm_msi (in) +Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error + +Directly inject a MSI message. Only valid with in-kernel irqchip that handles +MSI messages. + +struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u32 devid; + __u8 pad[12]; +}; + +flags: KVM_MSI_VALID_DEVID: devid contains a valid value. The per-VM + KVM_CAP_MSI_DEVID capability advertises the requirement to provide + the device ID. If this capability is not available, userspace + should never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. + +If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier +for the device that wrote the MSI message. For PCI, this is usually a +BFD identifier in the lower 16 bits. + +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + + +4.71 KVM_CREATE_PIT2 + +Capability: KVM_CAP_PIT2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_config (in) +Returns: 0 on success, -1 on error + +Creates an in-kernel device model for the i8254 PIT. This call is only valid +after enabling in-kernel irqchip support via KVM_CREATE_IRQCHIP. The following +parameters have to be passed: + +struct kvm_pit_config { + __u32 flags; + __u32 pad[15]; +}; + +Valid flags are: + +#define KVM_PIT_SPEAKER_DUMMY 1 /* emulate speaker port stub */ + +PIT timer interrupts may use a per-VM kernel thread for injection. If it +exists, this thread will have a name of the following pattern: + +kvm-pit/ + +When running a guest with elevated priorities, the scheduling parameters of +this thread may have to be adjusted accordingly. + +This IOCTL replaces the obsolete KVM_CREATE_PIT. + + +4.72 KVM_GET_PIT2 + +Capability: KVM_CAP_PIT_STATE2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_state2 (out) +Returns: 0 on success, -1 on error + +Retrieves the state of the in-kernel PIT model. Only valid after +KVM_CREATE_PIT2. The state is returned in the following structure: + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + +Valid flags are: + +/* disable PIT in HPET legacy mode */ +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +This IOCTL replaces the obsolete KVM_GET_PIT. + + +4.73 KVM_SET_PIT2 + +Capability: KVM_CAP_PIT_STATE2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_state2 (in) +Returns: 0 on success, -1 on error + +Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. +See KVM_GET_PIT2 for details on struct kvm_pit_state2. + +This IOCTL replaces the obsolete KVM_SET_PIT. + + +4.74 KVM_PPC_GET_SMMU_INFO + +Capability: KVM_CAP_PPC_GET_SMMU_INFO +Architectures: powerpc +Type: vm ioctl +Parameters: None +Returns: 0 on success, -1 on error + +This populates and returns a structure describing the features of +the "Server" class MMU emulation supported by KVM. +This can in turn be used by userspace to generate the appropriate +device-tree properties for the guest operating system. + +The structure contains some global information, followed by an +array of supported segment page sizes: + + struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u32 pad; + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; + }; + +The supported flags are: + + - KVM_PPC_PAGE_SIZES_REAL: + When that flag is set, guest page sizes must "fit" the backing + store page sizes. When not set, any page size in the list can + be used regardless of how they are backed by userspace. + + - KVM_PPC_1T_SEGMENTS + The emulated MMU supports 1T segments in addition to the + standard 256M ones. + + - KVM_PPC_NO_HASH + This flag indicates that HPT guests are not supported by KVM, + thus all guests must use radix MMU mode. + +The "slb_size" field indicates how many SLB entries are supported + +The "sps" array contains 8 entries indicating the supported base +page sizes for a segment in increasing order. Each entry is defined +as follow: + + struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; + }; + +An entry with a "page_shift" of 0 is unused. Because the array is +organized in increasing order, a lookup can stop when encoutering +such an entry. + +The "slb_enc" field provides the encoding to use in the SLB for the +page size. The bits are in positions such as the value can directly +be OR'ed into the "vsid" argument of the slbmte instruction. + +The "enc" array is a list which for each of those segment base page +size provides the list of supported actual page sizes (which can be +only larger or equal to the base page size), along with the +corresponding encoding in the hash PTE. Similarly, the array is +8 entries sorted by increasing sizes and an entry with a "0" shift +is an empty entry and a terminator: + + struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ + }; + +The "pte_enc" field provides a value that can OR'ed into the hash +PTE's RPN field (ie, it needs to be shifted left by 12 to OR it +into the hash PTE second double word). + +4.75 KVM_IRQFD + +Capability: KVM_CAP_IRQFD +Architectures: x86 s390 arm arm64 +Type: vm ioctl +Parameters: struct kvm_irqfd (in) +Returns: 0 on success, -1 on error + +Allows setting an eventfd to directly trigger a guest interrupt. +kvm_irqfd.fd specifies the file descriptor to use as the eventfd and +kvm_irqfd.gsi specifies the irqchip pin toggled by this event. When +an event is triggered on the eventfd, an interrupt is injected into +the guest using the specified gsi pin. The irqfd is removed using +the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd +and kvm_irqfd.gsi. + +With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify +mechanism allowing emulation of level-triggered, irqfd-based +interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an +additional eventfd in the kvm_irqfd.resamplefd field. When operating +in resample mode, posting of an interrupt through kvm_irq.fd asserts +the specified gsi in the irqchip. When the irqchip is resampled, such +as from an EOI, the gsi is de-asserted and the user is notified via +kvm_irqfd.resamplefd. It is the user's responsibility to re-queue +the interrupt if the device making use of it still requires service. +Note that closing the resamplefd is not sufficient to disable the +irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment +and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. + +On arm/arm64, gsi routing being supported, the following can happen: +- in case no routing entry is associated to this gsi, injection fails +- in case the gsi is associated to an irqchip routing entry, + irqchip.pin + 32 corresponds to the injected SPI ID. +- in case the gsi is associated to an MSI routing entry, the MSI + message and device ID are translated into an LPI (support restricted + to GICv3 ITS in-kernel emulation). + +4.76 KVM_PPC_ALLOCATE_HTAB + +Capability: KVM_CAP_PPC_ALLOC_HTAB +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to u32 containing hash table order (in/out) +Returns: 0 on success, -1 on error + +This requests the host kernel to allocate an MMU hash table for a +guest using the PAPR paravirtualization interface. This only does +anything if the kernel is configured to use the Book 3S HV style of +virtualization. Otherwise the capability doesn't exist and the ioctl +returns an ENOTTY error. The rest of this description assumes Book 3S +HV. + +There must be no vcpus running when this ioctl is called; if there +are, it will do nothing and return an EBUSY error. + +The parameter is a pointer to a 32-bit unsigned integer variable +containing the order (log base 2) of the desired size of the hash +table, which must be between 18 and 46. On successful return from the +ioctl, the value will not be changed by the kernel. + +If no hash table has been allocated when any vcpu is asked to run +(with the KVM_RUN ioctl), the host kernel will allocate a +default-sized hash table (16 MB). + +If this ioctl is called when a hash table has already been allocated, +with a different order from the existing hash table, the existing hash +table will be freed and a new one allocated. If this is ioctl is +called when a hash table has already been allocated of the same order +as specified, the kernel will clear out the existing hash table (zero +all HPTEs). In either case, if the guest is using the virtualized +real-mode area (VRMA) facility, the kernel will re-create the VMRA +HPTEs on the next KVM_RUN of any vcpu. + +4.77 KVM_S390_INTERRUPT + +Capability: basic +Architectures: s390 +Type: vm ioctl, vcpu ioctl +Parameters: struct kvm_s390_interrupt (in) +Returns: 0 on success, -1 on error + +Allows to inject an interrupt to the guest. Interrupts can be floating +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. + +Interrupt parameters are passed via kvm_s390_interrupt: + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +type can be one of the following: + +KVM_S390_SIGP_STOP (vcpu) - sigp stop; optional flags in parm +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm +KVM_S390_RESTART (vcpu) - restart +KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt +KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt + parameters in parm and parm64 +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm +KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) - compound value to indicate an + I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel); + I/O interruption parameters in parm (subchannel) and parm64 (intparm, + interruption subclass) +KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm, + machine check interrupt code in parm64 (note that + machine checks needing further payload are not + supported by this ioctl) + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + +4.78 KVM_PPC_GET_HTAB_FD + +Capability: KVM_CAP_PPC_HTAB_FD +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to struct kvm_get_htab_fd (in) +Returns: file descriptor number (>= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this: + +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; + __u64 reserved[2]; +}; + +/* Values for kvm_get_htab_fd.flags */ +#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) +#define KVM_GET_HTAB_WRITE ((__u64)0x2) + +The `start_index' field gives the index in the HPT of the entry at +which to start reading. It is ignored when writing. + +Reads on the fd will initially supply information about all +"interesting" HPT entries. Interesting entries are those with the +bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise +all entries. When the end of the HPT is reached, the read() will +return. If read() is called again on the fd, it will start again from +the beginning of the HPT, but will only return HPT entries that have +changed since they were last read. + +Data read or written is structured as a header (8 bytes) followed by a +series of valid HPT entries (16 bytes) each. The header indicates how +many valid HPT entries there are and how many invalid entries follow +the valid entries. The invalid entries are not represented explicitly +in the stream. The header format is: + +struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; +}; + +Writes to the fd create HPT entries starting at the index given in the +header; first `n_valid' valid entries with contents from the data +written, then `n_invalid' invalid entries, invalidating any previously +valid entries found. + +4.79 KVM_CREATE_DEVICE + +Capability: KVM_CAP_DEVICE_CTRL +Type: vm ioctl +Parameters: struct kvm_create_device (in/out) +Returns: 0 on success, -1 on error +Errors: + ENODEV: The device type is unknown or unsupported + EEXIST: Device already created, and this type of device may not + be instantiated multiple times + + Other error conditions may be defined by individual device types or + have their standard meanings. + +Creates an emulated device in the kernel. The file descriptor returned +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR. + +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the +device type is supported (not necessarily whether it can be created +in the current vm). + +Individual devices should not define flags. Attributes should be used +for specifying any behavior that is not implied by the device type +number. + +struct kvm_create_device { + __u32 type; /* in: KVM_DEV_TYPE_xxx */ + __u32 fd; /* out: device handle */ + __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ +}; + +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR + +Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, + KVM_CAP_VCPU_ATTRIBUTES for vcpu device +Type: device ioctl, vm ioctl, vcpu ioctl +Parameters: struct kvm_device_attr +Returns: 0 on success, -1 on error +Errors: + ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + EPERM: The attribute cannot (currently) be accessed this way + (e.g. read-only attribute, or attribute that only makes + sense when the device is in a different state) + + Other error conditions may be defined by individual device types. + +Gets/sets a specified piece of device configuration and/or state. The +semantics are device-specific. See individual device documentation in +the "devices" directory. As with ONE_REG, the size of the data +transferred is defined by the particular attribute. + +struct kvm_device_attr { + __u32 flags; /* no flags currently defined */ + __u32 group; /* device-defined */ + __u64 attr; /* group-defined */ + __u64 addr; /* userspace address of attr data */ +}; + +4.81 KVM_HAS_DEVICE_ATTR + +Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, + KVM_CAP_VCPU_ATTRIBUTES for vcpu device +Type: device ioctl, vm ioctl, vcpu ioctl +Parameters: struct kvm_device_attr +Returns: 0 on success, -1 on error +Errors: + ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + +Tests whether a device supports a particular attribute. A successful +return indicates the attribute is implemented. It does not necessarily +indicate that the attribute can be read or written in the device's +current state. "addr" is ignored. + +4.82 KVM_ARM_VCPU_INIT + +Capability: basic +Architectures: arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_vcpu_init (in) +Returns: 0 on success; -1 on error +Errors: +  EINVAL:    the target is unknown, or the combination of features is invalid. +  ENOENT:    a features bit specified is unknown. + +This tells KVM what type of CPU to present to the guest, and what +optional features it should have.  This will cause a reset of the cpu +registers to their initial values.  If this is not called, KVM_RUN will +return ENOEXEC for that vcpu. + +Note that because some registers reflect machine topology, all vcpus +should be created before this ioctl is invoked. + +Userspace can call this function multiple times for a given vcpu, including +after the vcpu has been run. This will reset the vcpu to its initial +state. All calls to this function after the initial call must use the same +target and same set of feature flags, otherwise EINVAL will be returned. + +Possible features: + - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state. + Depends on KVM_CAP_ARM_PSCI. If not set, the CPU will be powered on + and execute guest code when KVM_RUN is called. + - KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode. + Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). + - KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 (or a future revision + backward compatible with v0.2) for the CPU. + Depends on KVM_CAP_ARM_PSCI_0_2. + - KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU. + Depends on KVM_CAP_ARM_PMU_V3. + + - KVM_ARM_VCPU_PTRAUTH_ADDRESS: Enables Address Pointer authentication + for arm64 only. + Depends on KVM_CAP_ARM_PTRAUTH_ADDRESS. + If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are + both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and + KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be + requested. + + - KVM_ARM_VCPU_PTRAUTH_GENERIC: Enables Generic Pointer authentication + for arm64 only. + Depends on KVM_CAP_ARM_PTRAUTH_GENERIC. + If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are + both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and + KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be + requested. + + - KVM_ARM_VCPU_SVE: Enables SVE for the CPU (arm64 only). + Depends on KVM_CAP_ARM_SVE. + Requires KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + * After KVM_ARM_VCPU_INIT: + + - KVM_REG_ARM64_SVE_VLS may be read using KVM_GET_ONE_REG: the + initial value of this pseudo-register indicates the best set of + vector lengths possible for a vcpu on this host. + + * Before KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + - KVM_RUN and KVM_GET_REG_LIST are not available; + + - KVM_GET_ONE_REG and KVM_SET_ONE_REG cannot be used to access + the scalable archietctural SVE registers + KVM_REG_ARM64_SVE_ZREG(), KVM_REG_ARM64_SVE_PREG() or + KVM_REG_ARM64_SVE_FFR; + + - KVM_REG_ARM64_SVE_VLS may optionally be written using + KVM_SET_ONE_REG, to modify the set of vector lengths available + for the vcpu. + + * After KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can + no longer be written using KVM_SET_ONE_REG. + +4.83 KVM_ARM_PREFERRED_TARGET + +Capability: basic +Architectures: arm, arm64 +Type: vm ioctl +Parameters: struct struct kvm_vcpu_init (out) +Returns: 0 on success; -1 on error +Errors: + ENODEV: no preferred target available for the host + +This queries KVM for preferred CPU target type which can be emulated +by KVM on underlying host. + +The ioctl returns struct kvm_vcpu_init instance containing information +about preferred CPU target type and recommended features for it. The +kvm_vcpu_init->features bitmap returned will have feature bits set if +the preferred target recommends setting these features, but this is +not mandatory. + +The information returned by this ioctl can be used to prepare an instance +of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in +in VCPU matching underlying host. + + +4.84 KVM_GET_REG_LIST + +Capability: basic +Architectures: arm, arm64, mips +Type: vcpu ioctl +Parameters: struct kvm_reg_list (in/out) +Returns: 0 on success; -1 on error +Errors: +  E2BIG:     the reg index list is too big to fit in the array specified by +             the user (the number required will be written into n). + +struct kvm_reg_list { + __u64 n; /* number of registers in reg[] */ + __u64 reg[0]; +}; + +This ioctl returns the guest registers that are supported for the +KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. + + +4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) + +Capability: KVM_CAP_ARM_SET_DEVICE_ADDR +Architectures: arm, arm64 +Type: vm ioctl +Parameters: struct kvm_arm_device_address (in) +Returns: 0 on success, -1 on error +Errors: + ENODEV: The device id is unknown + ENXIO: Device not supported on current system + EEXIST: Address already set + E2BIG: Address outside guest physical address space + EBUSY: Address overlaps with other device range + +struct kvm_arm_device_addr { + __u64 id; + __u64 addr; +}; + +Specify a device address in the guest's physical address space where guests +can access emulated or directly exposed devices, which the host kernel needs +to know about. The id field is an architecture specific identifier for a +specific device. + +ARM/arm64 divides the id field into two parts, a device id and an +address type id specific to the individual device. + +  bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 | + field: | 0x00000000 | device id | addr type id | + +ARM/arm64 currently only require this when using the in-kernel GIC +support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 +as the device id. When setting the base address for the guest's +mapping of the VGIC virtual CPU and distributor interface, the ioctl +must be called after calling KVM_CREATE_IRQCHIP, but before calling +KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the +base addresses will return -EEXIST. + +Note, this IOCTL is deprecated and the more flexible SET/GET_DEVICE_ATTR API +should be used instead. + + +4.86 KVM_PPC_RTAS_DEFINE_TOKEN + +Capability: KVM_CAP_PPC_RTAS +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_rtas_token_args +Returns: 0 on success, -1 on error + +Defines a token value for a RTAS (Run Time Abstraction Services) +service in order to allow it to be handled in the kernel. The +argument struct gives the name of the service, which must be the name +of a service that has a kernel-side implementation. If the token +value is non-zero, it will be associated with that service, and +subsequent RTAS calls by the guest specifying that token will be +handled by the kernel. If the token value is 0, then any token +associated with the service will be forgotten, and subsequent RTAS +calls by the guest for that service will be passed to userspace to be +handled. + +4.87 KVM_SET_GUEST_DEBUG + +Capability: KVM_CAP_SET_GUEST_DEBUG +Architectures: x86, s390, ppc, arm64 +Type: vcpu ioctl +Parameters: struct kvm_guest_debug (in) +Returns: 0 on success; -1 on error + +struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; +}; + +Set up the processor specific debug registers and configure vcpu for +handling guest debug events. There are two parts to the structure, the +first a control bitfield indicates the type of debug events to handle +when running. Common control bits are: + + - KVM_GUESTDBG_ENABLE: guest debugging is enabled + - KVM_GUESTDBG_SINGLESTEP: the next run should single-step + +The top 16 bits of the control field are architecture specific control +flags which can include the following: + + - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64] + - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64] + - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] + - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] + - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] + +For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints +are enabled in memory so we need to ensure breakpoint exceptions are +correctly trapped and the KVM run loop exits at the breakpoint and not +running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP +we need to ensure the guest vCPUs architecture specific registers are +updated to the correct (supplied) values. + +The second part of the structure is architecture specific and +typically contains a set of debug registers. + +For arm64 the number of debug registers is implementation defined and +can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and +KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number +indicating the number of supported registers. + +When debug events exit the main run loop with the reason +KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run +structure containing architecture specific debug information. + +4.88 KVM_GET_EMULATED_CPUID + +Capability: KVM_CAP_EXT_EMUL_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 flags; + struct kvm_cpuid_entry2 entries[0]; +}; + +The member 'flags' is used for passing flags from userspace. + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are emulated by +kvm.Userspace can use the information returned by this ioctl to query +which features are emulated by kvm instead of being present natively. + +Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 +structure with the 'nent' field indicating the number of entries in +the variable-size array 'entries'. If the number of entries is too low +to describe the cpu capabilities, an error (E2BIG) is returned. If the +number is too high, the 'nent' field is adjusted and an error (ENOMEM) +is returned. If the number is just right, the 'nent' field is adjusted +to the number of valid entries in the 'entries' array, which is then +filled. + +The entries returned are the set CPUID bits of the respective features +which kvm emulates, as returned by the CPUID instruction, with unknown +or unsupported feature bits cleared. + +Features like x2apic, for example, may not be present in the host cpu +but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be +emulated efficiently and thus not included here. + +The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + +4.89 KVM_S390_MEM_OP + +Capability: KVM_CAP_S390_MEM_OP +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_mem_op (in) +Returns: = 0 on success, + < 0 on generic error (e.g. -EFAULT or -ENOMEM), + > 0 if an exception occurred while walking the page tables + +Read or write data from/to the logical (virtual) memory of a VCPU. + +Parameters are specified via the following structure: + +struct kvm_s390_mem_op { + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + __u8 ar; /* the access register number */ + __u8 reserved[31]; /* should be set to 0 */ +}; + +The type of operation is specified in the "op" field. It is either +KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or +KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. The +KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check +whether the corresponding memory access would create an access exception +(without touching the data in the memory at the destination). In case an +access exception occurred while walking the MMU tables of the guest, the +ioctl returns a positive error number to indicate the type of exception. +This exception is also raised directly at the corresponding VCPU if the +flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. + +The start address of the memory region has to be specified in the "gaddr" +field, and the length of the region in the "size" field. "buf" is the buffer +supplied by the userspace application where the read data should be written +to for KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written +is stored for a KVM_S390_MEMOP_LOGICAL_WRITE. "buf" is unused and can be NULL +when KVM_S390_MEMOP_F_CHECK_ONLY is specified. "ar" designates the access +register number to be used. + +The "reserved" field is meant for future extensions. It is not used by +KVM with the currently defined set of flags. + +4.90 KVM_S390_GET_SKEYS + +Capability: KVM_CAP_S390_SKEYS +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_skeys +Returns: 0 on success, KVM_S390_GET_KEYS_NONE if guest is not using storage + keys, negative value on error + +This ioctl is used to get guest storage key values on the s390 +architecture. The ioctl takes parameters via the kvm_s390_skeys struct. + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +The start_gfn field is the number of the first guest frame whose storage keys +you want to get. + +The count field is the number of consecutive frames (starting from start_gfn) +whose storage keys to get. The count field must be at least 1 and the maximum +allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +will cause the ioctl to return -EINVAL. + +The skeydata_addr field is the address to a buffer large enough to hold count +bytes. This buffer will be filled with storage key data by the ioctl. + +4.91 KVM_S390_SET_SKEYS + +Capability: KVM_CAP_S390_SKEYS +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_skeys +Returns: 0 on success, negative value on error + +This ioctl is used to set guest storage key values on the s390 +architecture. The ioctl takes parameters via the kvm_s390_skeys struct. +See section on KVM_S390_GET_SKEYS for struct definition. + +The start_gfn field is the number of the first guest frame whose storage keys +you want to set. + +The count field is the number of consecutive frames (starting from start_gfn) +whose storage keys to get. The count field must be at least 1 and the maximum +allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +will cause the ioctl to return -EINVAL. + +The skeydata_addr field is the address to a buffer containing count bytes of +storage keys. Each byte in the buffer will be set as the storage key for a +single frame starting at start_gfn for count frames. + +Note: If any architecturally invalid key value is found in the given data then +the ioctl will return -EINVAL. + +4.92 KVM_S390_IRQ + +Capability: KVM_CAP_S390_INJECT_IRQ +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq (in) +Returns: 0 on success, -1 on error +Errors: + EINVAL: interrupt type is invalid + type is KVM_S390_SIGP_STOP and flag parameter is invalid value + type is KVM_S390_INT_EXTERNAL_CALL and code is bigger + than the maximum of VCPUs + EBUSY: type is KVM_S390_SIGP_SET_PREFIX and vcpu is not stopped + type is KVM_S390_SIGP_STOP and a stop irq is already pending + type is KVM_S390_INT_EXTERNAL_CALL and an external call interrupt + is already pending + +Allows to inject an interrupt to the guest. + +Using struct kvm_s390_irq as a parameter allows +to inject additional payload which is not +possible via KVM_S390_INTERRUPT. + +Interrupt parameters are passed via kvm_s390_irq: + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +type can be one of the following: + +KVM_S390_SIGP_STOP - sigp stop; parameter in .stop +KVM_S390_PROGRAM_INT - program check; parameters in .pgm +KVM_S390_SIGP_SET_PREFIX - sigp set prefix; parameters in .prefix +KVM_S390_RESTART - restart; no parameters +KVM_S390_INT_CLOCK_COMP - clock comparator interrupt; no parameters +KVM_S390_INT_CPU_TIMER - CPU timer interrupt; no parameters +KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg +KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall +KVM_S390_MCHK - machine check interrupt; parameters in .mchk + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + +4.94 KVM_S390_GET_IRQ_STATE + +Capability: KVM_CAP_S390_IRQ_STATE +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq_state (out) +Returns: >= number of bytes copied into buffer, + -EINVAL if buffer size is 0, + -ENOBUFS if buffer size is too small to fit all pending interrupts, + -EFAULT if the buffer address was invalid + +This ioctl allows userspace to retrieve the complete state of all currently +pending interrupts in a single buffer. Use cases include migration +and introspection. The parameter structure contains the address of a +userspace buffer and its length: + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +Userspace passes in the above struct and for each pending interrupt a +struct kvm_s390_irq is copied to the provided buffer. + +The structure contains a flags and a reserved field for future extensions. As +the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and +reserved, these fields can not be used in the future without breaking +compatibility. + +If -ENOBUFS is returned the buffer provided was too small and userspace +may retry with a bigger buffer. + +4.95 KVM_S390_SET_IRQ_STATE + +Capability: KVM_CAP_S390_IRQ_STATE +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq_state (in) +Returns: 0 on success, + -EFAULT if the buffer address was invalid, + -EINVAL for an invalid buffer length (see below), + -EBUSY if there were already interrupts pending, + errors occurring when actually injecting the + interrupt. See KVM_S390_IRQ. + +This ioctl allows userspace to set the complete state of all cpu-local +interrupts currently pending for the vcpu. It is intended for restoring +interrupt state after a migration. The input parameter is a userspace buffer +containing a struct kvm_s390_irq_state: + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +The restrictions for flags and reserved apply as well. +(see KVM_S390_GET_IRQ_STATE) + +The userspace memory referenced by buf contains a struct kvm_s390_irq +for each interrupt to be injected into the guest. +If one of the interrupts could not be injected for some reason the +ioctl aborts. + +len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0 +and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), +which is the maximum number of possibly pending cpu-local interrupts. + +4.96 KVM_SMI + +Capability: KVM_CAP_X86_SMM +Architectures: x86 +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Queues an SMI on the thread's vcpu. + +4.97 KVM_CAP_PPC_MULTITCE + +Capability: KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significantly accelerates DMA operations for PPC KVM guests. +User space should expect that its handlers for these hypercalls +are not going to be called if user space previously registered LIOBN +in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +user space might have to advertise it for the guest. For example, +IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is +present in the "ibm,hypertas-functions" device-tree property. + +The hypercalls mentioned above may or may not be processed successfully +in the kernel based fast path. If they can not be handled by the kernel, +they will get passed on to user space. So user space still has to have +an implementation for these despite the in kernel acceleration. + +This capability is always enabled. + +4.98 KVM_CREATE_SPAPR_TCE_64 + +Capability: KVM_CAP_SPAPR_TCE_64 +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_64 (in) +Returns: file descriptor for manipulating the created TCE table + +This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit +windows, described in 4.62 KVM_CREATE_SPAPR_TCE + +This capability uses extended struct in ioctl interface: + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u32 flags; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ +}; + +The aim of extension is to support an additional bigger DMA window with +a variable page size. +KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and +a bus offset of the corresponding DMA window, @size and @offset are numbers +of IOMMU pages. + +@flags are not used at the moment. + +The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. + +4.99 KVM_REINJECT_CONTROL + +Capability: KVM_CAP_REINJECT_CONTROL +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_reinject_control (in) +Returns: 0 on success, + -EFAULT if struct kvm_reinject_control cannot be read, + -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier. + +i8254 (PIT) has two modes, reinject and !reinject. The default is reinject, +where KVM queues elapsed i8254 ticks and monitors completion of interrupt from +vector(s) that i8254 injects. Reinject mode dequeues a tick and injects its +interrupt whenever there isn't a pending interrupt from i8254. +!reinject mode injects an interrupt as soon as a tick arrives. + +struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; +}; + +pit_reinject = 0 (!reinject mode) is recommended, unless running an old +operating system that uses the PIT for timing (e.g. Linux 2.4.x). + +4.100 KVM_PPC_CONFIGURE_V3_MMU + +Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_mmuv3_cfg (in) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. + +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; +}; + +There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's space. This field is formatted +as the second doubleword of the partition table entry, as defined in +the Power ISA V3.00, Book III section 5.7.6.1. + +4.101 KVM_PPC_GET_RMMU_INFO + +Capability: KVM_CAP_PPC_RADIX_MMU +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_rmmu_info (out) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list +containing supported radix tree geometries, and (b) a list that maps +page sizes to put in the "AP" (actual page size) field for the tlbie +(TLB invalidate entry) instruction. + +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + +4.102 KVM_PPC_RESIZE_HPT_PREPARE + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENOMEM if unable to allocate the new HPT + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete. + +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above. + +If called when there is a pending HPT of the size requested, will: + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion, currently setting any bits in +flags will result in an -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENXIO is there is no pending HPT, or the pending HPT doesn't + have the requested size + -EBUSY if the pending HPT is not fully prepared + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started, +but failed). + +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On succsful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. + +On failure, the guest will still be operating on its previous HPT. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.104 KVM_X86_GET_MCE_CAP_SUPPORTED + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: system ioctl +Parameters: u64 mce_cap (out) +Returns: 0 on success, -1 on error + +Returns supported MCE capabilities. The u64 mce_cap parameter +has the same format as the MSR_IA32_MCG_CAP register. Supported +capabilities will have the corresponding bits set. + +4.105 KVM_X86_SETUP_MCE + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: vcpu ioctl +Parameters: u64 mcg_cap (in) +Returns: 0 on success, + -EFAULT if u64 mcg_cap cannot be read, + -EINVAL if the requested number of banks is invalid, + -EINVAL if requested MCE capability is not supported. + +Initializes MCE support for use. The u64 mcg_cap parameter +has the same format as the MSR_IA32_MCG_CAP register and +specifies which capabilities should be enabled. The maximum +supported number of error-reporting banks can be retrieved when +checking for KVM_CAP_MCE. The supported capabilities can be +retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED. + +4.106 KVM_X86_SET_MCE + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_x86_mce (in) +Returns: 0 on success, + -EFAULT if struct kvm_x86_mce cannot be read, + -EINVAL if the bank number is invalid, + -EINVAL if VAL bit is not set in status field. + +Inject a machine check error (MCE) into the guest. The input +parameter is: + +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +If the MCE being reported is an uncorrected error, KVM will +inject it as an MCE exception into the guest. If the guest +MCG_STATUS register reports that an MCE is in progress, KVM +causes an KVM_EXIT_SHUTDOWN vmexit. + +Otherwise, if the MCE is a corrected error, KVM will just +store it in the corresponding bank (provided this bank is +not holding a previously reported uncorrected error). + +4.107 KVM_S390_GET_CMMA_BITS + +Capability: KVM_CAP_S390_CMMA_MIGRATION +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_cmma_log (in, out) +Returns: 0 on success, a negative value on error + +This ioctl is used to get the values of the CMMA bits on the s390 +architecture. It is meant to be used in two scenarios: +- During live migration to save the CMMA values. Live migration needs + to be enabled via the KVM_REQ_START_MIGRATION VM property. +- To non-destructively peek at the CMMA values, with the flag + KVM_S390_CMMA_PEEK set. + +The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired +values are written to a buffer whose location is indicated via the "values" +member in the kvm_s390_cmma_log struct. The values in the input struct are +also updated as needed. +Each CMMA value takes up one byte. + +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +start_gfn is the number of the first guest frame whose CMMA values are +to be retrieved, + +count is the length of the buffer in bytes, + +values points to the buffer where the result will be written to. + +If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be +KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with +other ioctls. + +The result is written in the buffer pointed to by the field values, and +the values of the input parameter are updated as follows. + +Depending on the flags, different actions are performed. The only +supported flag so far is KVM_S390_CMMA_PEEK. + +The default behaviour if KVM_S390_CMMA_PEEK is not set is: +start_gfn will indicate the first page frame whose CMMA bits were dirty. +It is not necessarily the same as the one passed as input, as clean pages +are skipped. + +count will indicate the number of bytes actually written in the buffer. +It can (and very often will) be smaller than the input value, since the +buffer is only filled until 16 bytes of clean values are found (which +are then not copied in the buffer). Since a CMMA migration block needs +the base address and the length, for a total of 16 bytes, we will send +back some clean data if there is some dirty data afterwards, as long as +the size of the clean data does not exceed the size of the header. This +allows to minimize the amount of data to be saved or transferred over +the network at the expense of more roundtrips to userspace. The next +invocation of the ioctl will skip over all the clean values, saving +potentially more than just the 16 bytes we found. + +If KVM_S390_CMMA_PEEK is set: +the existing storage attributes are read even when not in migration +mode, and no other action is performed; + +the output start_gfn will be equal to the input start_gfn, + +the output count will be equal to the input count, except if the end of +memory has been reached. + +In both cases: +the field "remaining" will indicate the total number of dirty CMMA values +still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is +not enabled. + +mask is unused. + +values points to the userspace buffer where the result will be stored. + +This ioctl can fail with -ENOMEM if not enough memory can be allocated to +complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if +KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with +-EFAULT if the userspace address is invalid or if no page table is +present for the addresses (e.g. when using hugepages). + +4.108 KVM_S390_SET_CMMA_BITS + +Capability: KVM_CAP_S390_CMMA_MIGRATION +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_cmma_log (in) +Returns: 0 on success, a negative value on error + +This ioctl is used to set the values of the CMMA bits on the s390 +architecture. It is meant to be used during live migration to restore +the CMMA values, but there are no restrictions on its use. +The ioctl takes parameters via the kvm_s390_cmma_values struct. +Each CMMA value takes up one byte. + +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +start_gfn indicates the starting guest frame number, + +count indicates how many values are to be considered in the buffer, + +flags is not used and must be 0. + +mask indicates which PGSTE bits are to be considered. + +remaining is not used. + +values points to the buffer in userspace where to store the values. + +This ioctl can fail with -ENOMEM if not enough memory can be allocated to +complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if +the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or +if the flags field was not 0, with -EFAULT if the userspace address is +invalid, if invalid pages are written to (e.g. after the end of memory) +or if no page table is present for the addresses (e.g. when using +hugepages). + +4.109 KVM_PPC_GET_CPU_CHAR + +Capability: KVM_CAP_PPC_GET_CPU_CHAR +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_cpu_char (out) +Returns: 0 on successful completion + -EFAULT if struct kvm_ppc_cpu_char cannot be written + +This ioctl gives userspace information about certain characteristics +of the CPU relating to speculative execution of instructions and +possible information leakage resulting from speculative execution (see +CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754). The information is +returned in struct kvm_ppc_cpu_char, which looks like this: + +struct kvm_ppc_cpu_char { + __u64 character; /* characteristics of the CPU */ + __u64 behaviour; /* recommended software behaviour */ + __u64 character_mask; /* valid bits in character */ + __u64 behaviour_mask; /* valid bits in behaviour */ +}; + +For extensibility, the character_mask and behaviour_mask fields +indicate which bits of character and behaviour have been filled in by +the kernel. If the set of defined bits is extended in future then +userspace will be able to tell whether it is running on a kernel that +knows about the new bits. + +The character field describes attributes of the CPU which can help +with preventing inadvertent information disclosure - specifically, +whether there is an instruction to flash-invalidate the L1 data cache +(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set +to a mode where entries can only be used by the thread that created +them, whether the bcctr[l] instruction prevents speculation, and +whether a speculation barrier instruction (ori 31,31,0) is provided. + +The behaviour field describes actions that software should take to +prevent inadvertent information disclosure, and thus describes which +vulnerabilities the hardware is subject to; specifically whether the +L1 data cache should be flushed when returning to user mode from the +kernel, and whether a speculation barrier should be placed between an +array bounds check and the array access. + +These fields use the same bit definitions as the new +H_GET_CPU_CHARACTERISTICS hypercall. + +4.110 KVM_MEMORY_ENCRYPT_OP + +Capability: basic +Architectures: x86 +Type: system +Parameters: an opaque platform specific structure (in/out) +Returns: 0 on success; -1 on error + +If the platform supports creating encrypted VMs then this ioctl can be used +for issuing platform-specific memory encryption commands to manage those +encrypted VMs. + +Currently, this ioctl is used for issuing Secure Encrypted Virtualization +(SEV) commands on AMD Processors. The SEV commands are defined in +Documentation/virt/kvm/amd-memory-encryption.rst. + +4.111 KVM_MEMORY_ENCRYPT_REG_REGION + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_enc_region (in) +Returns: 0 on success; -1 on error + +This ioctl can be used to register a guest memory region which may +contain encrypted data (e.g. guest RAM, SMRAM etc). + +It is used in the SEV-enabled guest. When encryption is enabled, a guest +memory region may contain encrypted data. The SEV memory encryption +engine uses a tweak such that two identical plaintext pages, each at +different locations will have differing ciphertexts. So swapping or +moving ciphertext of those pages will not result in plaintext being +swapped. So relocating (or migrating) physical backing pages for the SEV +guest will require some additional steps. + +Note: The current SEV key management spec does not provide commands to +swap or migrate (move) ciphertext pages. Hence, for now we pin the guest +memory region registered with the ioctl. + +4.112 KVM_MEMORY_ENCRYPT_UNREG_REGION + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_enc_region (in) +Returns: 0 on success; -1 on error + +This ioctl can be used to unregister the guest memory region registered +with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above. + +4.113 KVM_HYPERV_EVENTFD + +Capability: KVM_CAP_HYPERV_EVENTFD +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_hyperv_eventfd (in) + +This ioctl (un)registers an eventfd to receive notifications from the guest on +the specified Hyper-V connection id through the SIGNAL_EVENT hypercall, without +causing a user exit. SIGNAL_EVENT hypercall with non-zero event flag number +(bits 24-31) still triggers a KVM_EXIT_HYPERV_HCALL user exit. + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +The conn_id field should fit within 24 bits: + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff + +The acceptable values for the flags field are: + +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + +Returns: 0 on success, + -EINVAL if conn_id or flags is outside the allowed range + -ENOENT on deassign if the conn_id isn't registered + -EEXIST on assign if the conn_id is already registered + +4.114 KVM_GET_NESTED_STATE + +Capability: KVM_CAP_NESTED_STATE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_nested_state (in/out) +Returns: 0 on success, -1 on error +Errors: + E2BIG: the total state size exceeds the value of 'size' specified by + the user; the size required will be written into size. + +struct kvm_nested_state { + __u16 flags; + __u16 format; + __u32 size; + + union { + struct kvm_vmx_nested_state_hdr vmx; + struct kvm_svm_nested_state_hdr svm; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + } hdr; + + union { + struct kvm_vmx_nested_state_data vmx[0]; + struct kvm_svm_nested_state_data svm[0]; + } data; +}; + +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 +#define KVM_STATE_NESTED_EVMCS 0x00000004 + +#define KVM_STATE_NESTED_FORMAT_VMX 0 +#define KVM_STATE_NESTED_FORMAT_SVM 1 + +#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 + +#define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state_hdr { + __u64 vmxon_pa; + __u64 vmcs12_pa; + + struct { + __u16 flags; + } smm; +}; + +struct kvm_vmx_nested_state_data { + __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; + __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; +}; + +This ioctl copies the vcpu's nested virtualization state from the kernel to +userspace. + +The maximum size of the state can be retrieved by passing KVM_CAP_NESTED_STATE +to the KVM_CHECK_EXTENSION ioctl(). + +4.115 KVM_SET_NESTED_STATE + +Capability: KVM_CAP_NESTED_STATE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_nested_state (in) +Returns: 0 on success, -1 on error + +This copies the vcpu's kvm_nested_state struct from userspace to the kernel. +For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. + +4.116 KVM_(UN)REGISTER_COALESCED_MMIO + +Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio) + KVM_CAP_COALESCED_PIO (for coalesced pio) +Architectures: all +Type: vm ioctl +Parameters: struct kvm_coalesced_mmio_zone +Returns: 0 on success, < 0 on error + +Coalesced I/O is a performance optimization that defers hardware +register write emulation so that userspace exits are avoided. It is +typically used to reduce the overhead of emulating frequently accessed +hardware registers. + +When a hardware register is configured for coalesced I/O, write accesses +do not exit to userspace and their value is recorded in a ring buffer +that is shared between kernel and userspace. + +Coalesced I/O is used if one or more write accesses to a hardware +register can be deferred until a read or a write to another hardware +register on the same device. This last access will cause a vmexit and +userspace will process accesses from the ring buffer before emulating +it. That will avoid exiting to userspace on repeated writes. + +Coalesced pio is based on coalesced mmio. There is little difference +between coalesced mmio and pio except that coalesced pio records accesses +to I/O ports. + +4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) + +Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 +Architectures: x86, arm, arm64, mips +Type: vm ioctl +Parameters: struct kvm_dirty_log (in) +Returns: 0 on success, -1 on error + +/* for KVM_CLEAR_DIRTY_LOG */ +struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +The ioctl clears the dirty status of pages in a memory slot, according to +the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap +field. Bit 0 of the bitmap corresponds to page "first_page" in the +memory slot, and num_pages is the size in bits of the input bitmap. +first_page must be a multiple of 64; num_pages must also be a multiple of +64 unless first_page + num_pages is the size of the memory slot. For each +bit that is set in the input bitmap, the corresponding page is marked "clean" +in KVM's dirty bitmap, and dirty tracking is re-enabled for that page +(for example via write-protection, or by clearing the dirty bit in +a page table entry). + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies +the address space for which you want to return the dirty bitmap. +They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 +is enabled; for more information, see the description of the capability. +However, it can always be used as long as KVM_CHECK_EXTENSION confirms +that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is present. + +4.118 KVM_GET_SUPPORTED_HV_CPUID + +Capability: KVM_CAP_HYPERV_CPUID +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features leaves related to Hyper-V emulation in +KVM. Userspace can use the information returned by this ioctl to construct +cpuid information presented to guests consuming Hyper-V enlightenments (e.g. +Windows or Hyper-V guests). + +CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level +Functional Specification (TLFS). These leaves can't be obtained with +KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature +leaves (0x40000000, 0x40000001). + +Currently, the following list of CPUID leaves are returned: + HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS + HYPERV_CPUID_INTERFACE + HYPERV_CPUID_VERSION + HYPERV_CPUID_FEATURES + HYPERV_CPUID_ENLIGHTMENT_INFO + HYPERV_CPUID_IMPLEMENT_LIMITS + HYPERV_CPUID_NESTED_FEATURES + +HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was +enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe all Hyper-V +feature leaves, an error (E2BIG) is returned. If the number is more or equal +to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the +number of valid entries in the 'entries' array, which is then filled. + +'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved, +userspace should not expect to get any particular value there. + +4.119 KVM_ARM_VCPU_FINALIZE + +Architectures: arm, arm64 +Type: vcpu ioctl +Parameters: int feature (in) +Returns: 0 on success, -1 on error +Errors: + EPERM: feature not enabled, needs configuration, or already finalized + EINVAL: feature unknown or not present + +Recognised values for feature: + arm64 KVM_ARM_VCPU_SVE (requires KVM_CAP_ARM_SVE) + +Finalizes the configuration of the specified vcpu feature. + +The vcpu must already have been initialised, enabling the affected feature, by +means of a successful KVM_ARM_VCPU_INIT call with the appropriate flag set in +features[]. + +For affected vcpu features, this is a mandatory step that must be performed +before the vcpu is fully usable. + +Between KVM_ARM_VCPU_INIT and KVM_ARM_VCPU_FINALIZE, the feature may be +configured by use of ioctls such as KVM_SET_ONE_REG. The exact configuration +that should be performaned and how to do it are feature-dependent. + +Other calls that depend on a particular feature being finalized, such as +KVM_RUN, KVM_GET_REG_LIST, KVM_GET_ONE_REG and KVM_SET_ONE_REG, will fail with +-EPERM unless the feature has already been finalized by means of a +KVM_ARM_VCPU_FINALIZE call. + +See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization +using this ioctl. + +4.120 KVM_SET_PMU_EVENT_FILTER + +Capability: KVM_CAP_PMU_EVENT_FILTER +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pmu_event_filter (in) +Returns: 0 on success, -1 on error + +struct kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 pad[4]; + __u64 events[0]; +}; + +This ioctl restricts the set of PMU events that the guest can program. +The argument holds a list of events which will be allowed or denied. +The eventsel+umask of each event the guest attempts to program is compared +against the events field to determine whether the guest should have access. +The events field only controls general purpose counters; fixed purpose +counters are controlled by the fixed_counter_bitmap. + +No flags are defined yet, the field must be zero. + +Valid values for 'action': +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + +5. The kvm_run structure +------------------------ + +Application code obtains a pointer to the kvm_run structure by +mmap()ing a vcpu fd. From that point, application code can control +execution by changing fields in kvm_run prior to calling the KVM_RUN +ioctl, and obtain information about the reason KVM_RUN returned by +looking up structure members. + +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + +Request that KVM_RUN return when it becomes possible to inject external +interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. + + __u8 immediate_exit; + +This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN +exits immediately, returning -EINTR. In the common scenario where a +signal is used to "kick" a VCPU out of KVM_RUN, this field can be used +to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. +Rather than blocking the signal outside KVM_RUN, userspace can set up +a signal handler that sets run->immediate_exit to a non-zero value. + +This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. + + __u8 padding1[6]; + + /* out */ + __u32 exit_reason; + +When KVM_RUN has returned successfully (return value 0), this informs +application code why KVM_RUN has returned. Allowable values for this +field are detailed below. + + __u8 ready_for_interrupt_injection; + +If request_interrupt_window has been specified, this field indicates +an interrupt can be injected now with KVM_INTERRUPT. + + __u8 if_flag; + +The value of the current interrupt flag. Only valid if in-kernel +local APIC is not used. + + __u16 flags; + +More architecture-specific flags detailing state of the VCPU that may +affect the device's behavior. The only currently defined flag is +KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the +VCPU is in system management mode. + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + +The value of the cr8 register. Only valid if in-kernel local APIC is +not used. Both input and output. + + __u64 apic_base; + +The value of the APIC BASE msr. Only valid if in-kernel local +APIC is not used. Both input and output. + + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + +If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown +reasons. Further architecture-specific information is available in +hardware_exit_reason. + + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + +If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due +to unknown reasons. Further architecture-specific information is +available in hardware_entry_failure_reason. + + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + +Unused. + + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + +If exit_reason is KVM_EXIT_IO, then the vcpu has +executed a port I/O instruction which could not be satisfied by kvm. +data_offset describes where the data is located (KVM_EXIT_IO_OUT) or +where kvm expects application code to place the data for the next +KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. + + /* KVM_EXIT_DEBUG */ + struct { + struct kvm_debug_exit_arch arch; + } debug; + +If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event +for which architecture specific information is returned. + + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + +If exit_reason is KVM_EXIT_MMIO, then the vcpu has +executed a memory-mapped I/O instruction which could not be satisfied +by kvm. The 'data' member contains the written data if 'is_write' is +true, and should be filled by application code otherwise. + +The 'data' member contains, in its first 'len' bytes, the value as it would +appear if the VCPU performed a load or store of the appropriate width directly +to the byte array. + +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and + KVM_EXIT_EPR the corresponding +operations are complete (and guest state is consistent) only after userspace +has re-entered the kernel with KVM_RUN. The kernel side will first finish +incomplete operations and then check for pending signals. Userspace +can re-enter the guest with an unmasked signal pending to complete +pending operations. + + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + +Unused. This was once used for 'hypercall to userspace'. To implement +such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). +Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. + + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + +To be documented (KVM_TPR_ACCESS_REPORTING). + + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u64 mask; /* psw upper half */ + __u64 addr; /* psw lower half */ + __u16 ipa; + __u32 ipb; + } s390_sieic; + +s390 specific. + + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + +s390 specific. + + /* KVM_EXIT_S390_UCONTROL */ + struct { + __u64 trans_exc_code; + __u32 pgm_code; + } s390_ucontrol; + +s390 specific. A page fault has occurred for a user controlled virtual +machine (KVM_VM_S390_UNCONTROL) on it's host page table that cannot be +resolved by the kernel. +The program code and the translation exception code that were placed +in the cpu's lowcore are presented here as defined by the z Architecture +Principles of Operation Book in the Chapter for Dynamic Address Translation +(DAT) + + /* KVM_EXIT_DCR */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + +Deprecated - was used for 440 KVM. + + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + +MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch +hypercalls and exit with this exit struct that contains all the guest gprs. + +If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. +Userspace can now handle the hypercall and when it's done modify the gprs as +necessary. Upon guest entry all guest GPRs will then be replaced by the values +in this struct. + + /* KVM_EXIT_PAPR_HCALL */ + struct { + __u64 nr; + __u64 ret; + __u64 args[9]; + } papr_hcall; + +This is used on 64-bit PowerPC when emulating a pSeries partition, +e.g. with the 'pseries' machine type in qemu. It occurs when the +guest does a hypercall using the 'sc 1' instruction. The 'nr' field +contains the hypercall number (from the guest R3), and 'args' contains +the arguments (from the guest R4 - R12). Userspace should put the +return code in 'ret' and any extra returned values in args[]. +The possible hypercalls are defined in the Power Architecture Platform +Requirements (PAPR) document available from www.power.org (free +developer registration required to access it). + + /* KVM_EXIT_S390_TSCH */ + struct { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; + __u32 ipb; + __u8 dequeued; + } s390_tsch; + +s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled +and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O +interrupt for the target subchannel has been dequeued and subchannel_id, +subchannel_nr, io_int_parm and io_int_word contain the parameters for that +interrupt. ipb is needed for instruction parameter decoding. + + /* KVM_EXIT_EPR */ + struct { + __u32 epr; + } epr; + +On FSL BookE PowerPC chips, the interrupt controller has a fast patch +interrupt acknowledge path to the core. When the core successfully +delivers an interrupt, it automatically populates the EPR register with +the interrupt vector number and acknowledges the interrupt inside +the interrupt controller. + +In case the interrupt controller lives in user space, we need to do +the interrupt acknowledge cycle through it to fetch the next to be +delivered interrupt vector using this exit. + +It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an +external interrupt has just been delivered into the guest. User space +should put the acknowledged interrupt vector into the 'epr' field. + + /* KVM_EXIT_SYSTEM_EVENT */ + struct { +#define KVM_SYSTEM_EVENT_SHUTDOWN 1 +#define KVM_SYSTEM_EVENT_RESET 2 +#define KVM_SYSTEM_EVENT_CRASH 3 + __u32 type; + __u64 flags; + } system_event; + +If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered +a system-level event using some architecture specific mechanism (hypercall +or some special instruction). In case of ARM/ARM64, this is triggered using +HVC instruction based PSCI call from the vcpu. The 'type' field describes +the system-level event type. The 'flags' field describes architecture +specific flags for the system-level event. + +Valid values for 'type' are: + KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the + VM. Userspace is not obliged to honour this, and if it does honour + this does not need to destroy the VM synchronously (ie it may call + KVM_RUN again before shutdown finally occurs). + KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM. + As with SHUTDOWN, userspace can choose to ignore the request, or + to schedule the reset to occur in the future and may call KVM_RUN again. + KVM_SYSTEM_EVENT_CRASH -- the guest crash occurred and the guest + has requested a crash condition maintenance. Userspace can choose + to ignore the request, or to gather VM memory core dump and/or + reset/shutdown of the VM. + + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + +Indicates that the VCPU's in-kernel local APIC received an EOI for a +level-triggered IOAPIC interrupt. This exit only triggers when the +IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); +the userspace IOAPIC should process the EOI and retrigger the interrupt if +it is still asserted. Vector is the LAPIC interrupt vector for which the +EOI was received. + + struct kvm_hyperv_exit { +#define KVM_EXIT_HYPERV_SYNIC 1 +#define KVM_EXIT_HYPERV_HCALL 2 + __u32 type; + union { + struct { + __u32 msr; + __u64 control; + __u64 evt_page; + __u64 msg_page; + } synic; + struct { + __u64 input; + __u64 result; + __u64 params[2]; + } hcall; + } u; + }; + /* KVM_EXIT_HYPERV */ + struct kvm_hyperv_exit hyperv; +Indicates that the VCPU exits into userspace to process some tasks +related to Hyper-V emulation. +Valid values for 'type' are: + KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about +Hyper-V SynIC state change. Notification is used to remap SynIC +event/message pages and to enable/disable SynIC messages/events processing +in userspace. + + /* Fix the size of the union. */ + char padding[256]; + }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[SYNC_REGS_SIZE_BYTES]; + } s; + +If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access +certain guest registers without having to call SET/GET_*REGS. Thus we can +avoid some system call overhead if userspace has to handle the exit. +Userspace can query the validity of the structure by checking +kvm_valid_regs for specific bits. These bits are architecture specific +and usually define the validity of a groups of registers. (e.g. one bit + for general purpose registers) + +Please note that the kernel is allowed to use the kvm_run structure as the +primary storage for certain register types. Therefore, the kernel may use the +values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. + +}; + + + +6. Capabilities that can be enabled on vCPUs +-------------------------------------------- + +There are certain capabilities that change the behavior of the virtual CPU or +the virtual machine when enabled. To enable them, please see section 4.37. +Below you can find a list of capabilities and what their effect on the vCPU or +the virtual machine is when enabling them. + +The following information is provided along with the description: + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Target: whether this is a per-vcpu or per-vm capability. + + Parameters: what parameters are accepted by the capability. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +6.1 KVM_CAP_PPC_OSI + +Architectures: ppc +Target: vcpu +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables interception of OSI hypercalls that otherwise would +be treated as normal system calls to be injected into the guest. OSI hypercalls +were invented by Mac-on-Linux to have a standardized communication mechanism +between the guest and the host. + +When this capability is enabled, KVM_EXIT_OSI can occur. + + +6.2 KVM_CAP_PPC_PAPR + +Architectures: ppc +Target: vcpu +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables interception of PAPR hypercalls. PAPR hypercalls are +done using the hypercall instruction "sc 1". + +It also sets the guest privilege level to "supervisor" mode. Usually the guest +runs in "hypervisor" privilege mode with a few missing features. + +In addition to the above, it changes the semantics of SDR1. In this mode, the +HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the +HTAB invisible to the guest. + +When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. + + +6.3 KVM_CAP_SW_TLB + +Architectures: ppc +Target: vcpu +Parameters: args[0] is the address of a struct kvm_config_tlb +Returns: 0 on success; -1 on error + +struct kvm_config_tlb { + __u64 params; + __u64 array; + __u32 mmu_type; + __u32 array_len; +}; + +Configures the virtual CPU's TLB array, establishing a shared memory area +between userspace and KVM. The "params" and "array" fields are userspace +addresses of mmu-type-specific data structures. The "array_len" field is an +safety mechanism, and should be set to the size in bytes of the memory that +userspace has reserved for the array. It must be at least the size dictated +by "mmu_type" and "params". + +While KVM_RUN is active, the shared region is under control of KVM. Its +contents are undefined, and any modification by userspace results in +boundedly undefined behavior. + +On return from KVM_RUN, the shared region will reflect the current state of +the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB +to tell KVM which entries have been changed, prior to calling KVM_RUN again +on this vcpu. + +For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: + - The "params" field is of type "struct kvm_book3e_206_tlb_params". + - The "array" field points to an array of type "struct + kvm_book3e_206_tlb_entry". + - The array consists of all entries in the first TLB, followed by all + entries in the second TLB. + - Within a TLB, entries are ordered first by increasing set number. Within a + set, entries are ordered by way (increasing ESEL). + - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1) + where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. + - The tsize field of mas1 shall be set to 4K on TLB0, even though the + hardware ignores this value for TLB0. + +6.4 KVM_CAP_S390_CSS_SUPPORT + +Architectures: s390 +Target: vcpu +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables support for handling of channel I/O instructions. + +TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are +handled in-kernel, while the other I/O instructions are passed to userspace. + +When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST +SUBCHANNEL intercepts. + +Note that even though this capability is enabled per-vcpu, the complete +virtual machine is affected. + +6.5 KVM_CAP_PPC_EPR + +Architectures: ppc +Target: vcpu +Parameters: args[0] defines whether the proxy facility is active +Returns: 0 on success; -1 on error + +This capability enables or disables the delivery of interrupts through the +external proxy facility. + +When enabled (args[0] != 0), every time the guest gets an external interrupt +delivered, it automatically exits into user space with a KVM_EXIT_EPR exit +to receive the topmost interrupt vector. + +When disabled (args[0] == 0), behavior is as if this facility is unsupported. + +When this capability is enabled, KVM_EXIT_EPR can occur. + +6.6 KVM_CAP_IRQ_MPIC + +Architectures: ppc +Parameters: args[0] is the MPIC device fd + args[1] is the MPIC CPU number for this vcpu + +This capability connects the vcpu to an in-kernel MPIC device. + +6.7 KVM_CAP_IRQ_XICS + +Architectures: ppc +Target: vcpu +Parameters: args[0] is the XICS device fd + args[1] is the XICS CPU number (server ID) for this vcpu + +This capability connects the vcpu to an in-kernel XICS device. + +6.8 KVM_CAP_S390_IRQCHIP + +Architectures: s390 +Target: vm +Parameters: none + +This capability enables the in-kernel irqchip for s390. Please refer to +"4.24 KVM_CREATE_IRQCHIP" for details. + +6.9 KVM_CAP_MIPS_FPU + +Architectures: mips +Target: vcpu +Parameters: args[0] is reserved for future use (should be 0). + +This capability allows the use of the host Floating Point Unit by the guest. It +allows the Config1.FP bit to be set to enable the FPU in the guest. Once this is +done the KVM_REG_MIPS_FPR_* and KVM_REG_MIPS_FCR_* registers can be accessed +(depending on the current guest FPU register mode), and the Status.FR, +Config5.FRE bits are accessible via the KVM API and also from the guest, +depending on them being supported by the FPU. + +6.10 KVM_CAP_MIPS_MSA + +Architectures: mips +Target: vcpu +Parameters: args[0] is reserved for future use (should be 0). + +This capability allows the use of the MIPS SIMD Architecture (MSA) by the guest. +It allows the Config3.MSAP bit to be set to enable the use of MSA by the guest. +Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be +accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from +the guest. + +6.74 KVM_CAP_SYNC_REGS +Architectures: s390, x86 +Target: s390: always enabled, x86: vcpu +Parameters: none +Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register +sets are supported (bitfields defined in arch/x86/include/uapi/asm/kvm.h). + +As described above in the kvm_sync_regs struct info in section 5 (kvm_run): +KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers +without having to call SET/GET_*REGS". This reduces overhead by eliminating +repeated ioctl calls for setting and/or getting register values. This is +particularly important when userspace is making synchronous guest state +modifications, e.g. when emulating and/or intercepting instructions in +userspace. + +For s390 specifics, please refer to the source code. + +For x86: +- the register sets to be copied out to kvm_run are selectable + by userspace (rather that all sets being copied out for every exit). +- vcpu_events are available in addition to regs and sregs. + +For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to +function as an input bit-array field set by userspace to indicate the +specific register sets to be copied out on the next exit. + +To indicate when userspace has modified values that should be copied into +the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set. +This is done using the same bitflags as for the 'kvm_valid_regs' field. +If the dirty bit is not set, then the register set values will not be copied +into the vCPU even if they've been modified. + +Unused bitfields in the bitarrays must be set to zero. + +struct kvm_sync_regs { + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_vcpu_events events; +}; + +6.75 KVM_CAP_PPC_IRQ_XIVE + +Architectures: ppc +Target: vcpu +Parameters: args[0] is the XIVE device fd + args[1] is the XIVE CPU number (server ID) for this vcpu + +This capability connects the vcpu to an in-kernel XIVE device. + +7. Capabilities that can be enabled on VMs +------------------------------------------ + +There are certain capabilities that change the behavior of the virtual +machine when enabled. To enable them, please see section 4.37. Below +you can find a list of capabilities and what their effect on the VM +is when enabling them. + +The following information is provided along with the description: + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Parameters: what parameters are accepted by the capability. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +7.1 KVM_CAP_PPC_ENABLE_HCALL + +Architectures: ppc +Parameters: args[0] is the sPAPR hcall number + args[1] is 0 to disable, 1 to enable in-kernel handling + +This capability controls whether individual sPAPR hypercalls (hcalls) +get handled by the kernel or not. Enabling or disabling in-kernel +handling of an hcall is effective across the VM. On creation, an +initial set of hcalls are enabled for in-kernel handling, which +consists of those hcalls for which in-kernel handlers were implemented +before this capability was implemented. If disabled, the kernel will +not to attempt to handle the hcall, but will always exit to userspace +to handle it. Note that it may not make sense to enable some and +disable others of a group of related hcalls, but KVM does not prevent +userspace from doing that. + +If the hcall number specified is not one that has an in-kernel +implementation, the KVM_ENABLE_CAP ioctl will fail with an EINVAL +error. + +7.2 KVM_CAP_S390_USER_SIGP + +Architectures: s390 +Parameters: none + +This capability controls which SIGP orders will be handled completely in user +space. With this capability enabled, all fast orders will be handled completely +in the kernel: +- SENSE +- SENSE RUNNING +- EXTERNAL CALL +- EMERGENCY SIGNAL +- CONDITIONAL EMERGENCY SIGNAL + +All other orders will be handled completely in user space. + +Only privileged operation exceptions will be checked for in the kernel (or even +in the hardware prior to interception). If this capability is not enabled, the +old way of handling SIGP orders is used (partially in kernel and user space). + +7.3 KVM_CAP_S390_VECTOR_REGISTERS + +Architectures: s390 +Parameters: none +Returns: 0 on success, negative value on error + +Allows use of the vector registers introduced with z13 processor, and +provides for the synchronization between host and user space. Will +return -EINVAL if the machine does not support vectors. + +7.4 KVM_CAP_S390_USER_STSI + +Architectures: s390 +Parameters: none + +This capability allows post-handlers for the STSI instruction. After +initial handling in the kernel, KVM exits to user space with +KVM_EXIT_S390_STSI to allow user space to insert further data. + +Before exiting to userspace, kvm handlers should fill in s390_stsi field of +vcpu->run: +struct { + __u64 addr; + __u8 ar; + __u8 reserved; + __u8 fc; + __u8 sel1; + __u16 sel2; +} s390_stsi; + +@addr - guest address of STSI SYSIB +@fc - function code +@sel1 - selector 1 +@sel2 - selector 2 +@ar - access register number + +KVM handlers should exit to userspace with rc = -EREMOTE. + +7.5 KVM_CAP_SPLIT_IRQCHIP + +Architectures: x86 +Parameters: args[0] - number of routes reserved for userspace IOAPICs +Returns: 0 on success, -1 on error + +Create a local apic for each processor in the kernel. This can be used +instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the +IOAPIC and PIC (and also the PIT, even though this has to be enabled +separately). + +This capability also enables in kernel routing of interrupt requests; +when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are +used in the IRQ routing table. The first args[0] MSI routes are reserved +for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, +a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. + +Fails if VCPU has already been created, or if the irqchip is already in the +kernel (i.e. KVM_CREATE_IRQCHIP has already been called). + +7.6 KVM_CAP_S390_RI + +Architectures: s390 +Parameters: none + +Allows use of runtime-instrumentation introduced with zEC12 processor. +Will return -EINVAL if the machine does not support runtime-instrumentation. +Will return -EBUSY if a VCPU has already been created. + +7.7 KVM_CAP_X2APIC_API + +Architectures: x86 +Parameters: args[0] - features that should be enabled +Returns: 0 on success, -EINVAL when args[0] contains invalid features + +Valid feature flags in args[0] are + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of +KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, +allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their +respective sections. + +KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work +in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff +as a broadcast even in x2APIC mode in order to support physical x2APIC +without interrupt remapping. This is undesirable in logical mode, +where 0xff represents CPUs 0-7 in cluster 0. + +7.8 KVM_CAP_S390_USER_INSTR0 + +Architectures: s390 +Parameters: none + +With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +be intercepted and forwarded to user space. User space can use this +mechanism e.g. to realize 2-byte software breakpoints. The kernel will +not inject an operating exception for these instructions, user space has +to take care of that. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. + +7.9 KVM_CAP_S390_GS + +Architectures: s390 +Parameters: none +Returns: 0 on success; -EINVAL if the machine does not support + guarded storage; -EBUSY if a VCPU has already been created. + +Allows use of guarded storage for the KVM guest. + +7.10 KVM_CAP_S390_AIS + +Architectures: s390 +Parameters: none + +Allow use of adapter-interruption suppression. +Returns: 0 on success; -EBUSY if a VCPU has already been created. + +7.11 KVM_CAP_PPC_SMT + +Architectures: ppc +Parameters: vsmt_mode, flags + +Enabling this capability on a VM provides userspace with a way to set +the desired virtual SMT mode (i.e. the number of virtual CPUs per +virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2 +between 1 and 8. On POWER8, vsmt_mode must also be no greater than +the number of threads per subcore for the host. Currently flags must +be 0. A successful call to enable this capability will result in +vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is +subsequently queried for the VM. This capability is only supported by +HV KVM, and can only be set before any VCPUs have been created. +The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT +modes are available. + +7.12 KVM_CAP_PPC_FWNMI + +Architectures: ppc +Parameters: none + +With this capability a machine check exception in the guest address +space will cause KVM to exit the guest with NMI exit reason. This +enables QEMU to build error log and branch to guest kernel registered +machine check handling routine. Without this capability KVM will +branch to guests' 0x200 interrupt vector. + +7.13 KVM_CAP_X86_DISABLE_EXITS + +Architectures: x86 +Parameters: args[0] defines which exits are disabled +Returns: 0 on success, -EINVAL when args[0] contains invalid exits + +Valid bits in args[0] are + +#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) +#define KVM_X86_DISABLE_EXITS_HLT (1 << 1) +#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) + +Enabling this capability on a VM provides userspace with a way to no +longer intercept some instructions for improved latency in some +workloads, and is suggested when vCPUs are associated to dedicated +physical CPUs. More bits can be added in the future; userspace can +just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable +all such vmexits. + +Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. + +7.14 KVM_CAP_S390_HPAGE_1M + +Architectures: s390 +Parameters: none +Returns: 0 on success, -EINVAL if hpage module parameter was not set + or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL + flag set + +With this capability the KVM support for memory backing with 1m pages +through hugetlbfs can be enabled for a VM. After the capability is +enabled, cmma can't be enabled anymore and pfmfi and the storage key +interpretation are disabled. If cmma has already been enabled or the +hpage module parameter is not set to 1, -EINVAL is returned. + +While it is generally possible to create a huge page backed VM without +this capability, the VM will not be able to run. + +7.15 KVM_CAP_MSR_PLATFORM_INFO + +Architectures: x86 +Parameters: args[0] whether feature should be enabled or not + +With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, +a #GP would be raised when the guest tries to access. Currently, this +capability does not enable write permissions of this MSR for the guest. + +7.16 KVM_CAP_PPC_NESTED_HV + +Architectures: ppc +Parameters: none +Returns: 0 on success, -EINVAL when the implementation doesn't support + nested-HV virtualization. + +HV-KVM on POWER9 and later systems allows for "nested-HV" +virtualization, which provides a way for a guest VM to run guests that +can run using the CPU's supervisor mode (privileged non-hypervisor +state). Enabling this capability on a VM depends on the CPU having +the necessary functionality and on the facility being enabled with a +kvm-hv module parameter. + +7.17 KVM_CAP_EXCEPTION_PAYLOAD + +Architectures: x86 +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, CR2 will not be modified prior to the +emulated VM-exit when L1 intercepts a #PF exception that occurs in +L2. Similarly, for kvm-intel only, DR6 will not be modified prior to +the emulated VM-exit when L1 intercepts a #DB exception that occurs in +L2. As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or +#DB) exception for L2, exception.has_payload will be set and the +faulting address (or the new DR6 bits*) will be reported in the +exception_payload field. Similarly, when userspace injects a #PF (or +#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set +exception.has_payload and to put the faulting address (or the new DR6 +bits*) in the exception_payload field. + +This capability also enables exception.pending in struct +kvm_vcpu_events, which allows userspace to distinguish between pending +and injected exceptions. + + +* For the new DR6 bits, note that bit 16 is set iff the #DB exception + will clear DR6.RTM. + +7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 + +Architectures: x86, arm, arm64, mips +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, KVM_GET_DIRTY_LOG will not automatically +clear and write-protect all pages that are returned as dirty. +Rather, userspace will have to do this operation separately using +KVM_CLEAR_DIRTY_LOG. + +At the cost of a slightly more complicated operation, this provides better +scalability and responsiveness for two reasons. First, +KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather +than requiring to sync a full memslot; this ensures that KVM does not +take spinlocks for an extended period of time. Second, in some cases a +large amount of time can pass between a call to KVM_GET_DIRTY_LOG and +userspace actually using the data in the page. Pages can be modified +during this time, which is inefficint for both the guest and userspace: +the guest will incur a higher penalty due to write protection faults, +while userspace can see false reports of dirty pages. Manual reprotection +helps reducing this time, improving guest performance and reducing the +number of dirty log false positives. + +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make +it hard or impossible to use it correctly. The availability of +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. +Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. + +8. Other capabilities. +---------------------- + +This section lists capabilities that give information about other +features of the KVM implementation. + +8.1 KVM_CAP_PPC_HWRNG + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel has an implementation of the +H_RANDOM hypercall backed by a hardware random-number generator. +If present, the kernel H_RANDOM handler can be enabled for guest use +with the KVM_CAP_PPC_ENABLE_HCALL capability. + +8.2 KVM_CAP_HYPERV_SYNIC + +Architectures: x86 +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel has an implementation of the +Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is +used to support Windows Hyper-V based guest paravirt drivers(VMBus). + +In order to use SynIC, it has to be activated by setting this +capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this +will disable the use of APIC hardware virtualization even if supported +by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_PPC_RADIX_MMU + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_HASH_MMU_V3 + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables. + +8.5 KVM_CAP_MIPS_VZ + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that full hardware assisted virtualization capabilities +of the hardware are available for use through KVM. An appropriate +KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which +utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using full hardware assisted virtualization +capabilities of the hardware. This is useful to check after creating a VM with +KVM_VM_MIPS_DEFAULT. + +The value returned by KVM_CHECK_EXTENSION should be compared against known +values (see below). All other values are reserved. This is to allow for the +possibility of other hardware assisted virtualization implementations which +may be incompatible with the MIPS VZ ASE. + + 0: The trap & emulate implementation is in use to run guest code in user + mode. Guest virtual memory segments are rearranged to fit the guest in the + user mode address space. + + 1: The MIPS VZ ASE is in use, providing full hardware assisted + virtualization, including standard guest virtual memory segments. + +8.6 KVM_CAP_MIPS_TE + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that the trap & emulate implementation is available to +run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware +assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed +to KVM_CREATE_VM to create a VM which utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using trap & emulate. + +8.7 KVM_CAP_MIPS_64BIT + +Architectures: mips + +This capability indicates the supported architecture type of the guest, i.e. the +supported register and address width. + +The values returned when this capability is checked by KVM_CHECK_EXTENSION on a +kvm VM handle correspond roughly to the CP0_Config.AT register field, and should +be checked specifically against known values (see below). All other values are +reserved. + + 0: MIPS32 or microMIPS32. + Both registers and addresses are 32-bits wide. + It will only be possible to run 32-bit guest code. + + 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. + Registers are 64-bits wide, but addresses are 32-bits wide. + 64-bit guest code may run but cannot access MIPS64 memory segments. + It will also be possible to run 32-bit guest code. + + 2: MIPS64 or microMIPS64 with access to all address segments. + Both registers and addresses are 64-bits wide. + It will be possible to run 64-bit or 32-bit guest code. + +8.9 KVM_CAP_ARM_USER_IRQ + +Architectures: arm, arm64 +This capability, if KVM_CHECK_EXTENSION indicates that it is available, means +that if userspace creates a VM without an in-kernel interrupt controller, it +will be notified of changes to the output level of in-kernel emulated devices, +which can generate virtual interrupts, presented to the VM. +For such VMs, on every return to userspace, the kernel +updates the vcpu's run->s.regs.device_irq_level field to represent the actual +output level of the device. + +Whenever kvm detects a change in the device output level, kvm guarantees at +least one return to userspace before running the VM. This exit could either +be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. This way, +userspace can always sample the device output level and re-compute the state of +the userspace interrupt controller. Userspace should always check the state +of run->s.regs.device_irq_level on every kvm exit. +The value in run->s.regs.device_irq_level can represent both level and edge +triggered interrupt signals, depending on the device. Edge triggered interrupt +signals will exit to userspace with the bit in run->s.regs.device_irq_level +set exactly once per edge signal. + +The field run->s.regs.device_irq_level is available independent of +run->kvm_valid_regs or run->kvm_dirty_regs bits. + +If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a +number larger than 0 indicating the version of this capability is implemented +and thereby which bits in in run->s.regs.device_irq_level can signal values. + +Currently the following bits are defined for the device_irq_level bitmap: + + KVM_CAP_ARM_USER_IRQ >= 1: + + KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer + KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer + KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal + +Future versions of kvm may implement additional events. These will get +indicated by returning a higher number from KVM_CHECK_EXTENSION and will be +listed above. + +8.10 KVM_CAP_PPC_SMT_POSSIBLE + +Architectures: ppc + +Querying this capability returns a bitmap indicating the possible +virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N +(counting from the right) is set, then a virtual SMT mode of 2^N is +available. + +8.11 KVM_CAP_HYPERV_SYNIC2 + +Architectures: x86 + +This capability enables a newer version of Hyper-V Synthetic interrupt +controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM +doesn't clear SynIC message and event flags pages when they are enabled by +writing to the respective MSRs. + +8.12 KVM_CAP_HYPERV_VP_INDEX + +Architectures: x86 + +This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its +value is used to denote the target vcpu for a SynIC interrupt. For +compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this +capability is absent, userspace can still query this msr's value. + +8.13 KVM_CAP_S390_AIS_MIGRATION + +Architectures: s390 +Parameters: none + +This capability indicates if the flic device will be able to get/set the +AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows +to discover this without having to create a flic device. + +8.14 KVM_CAP_S390_PSW + +Architectures: s390 + +This capability indicates that the PSW is exposed via the kvm_run structure. + +8.15 KVM_CAP_S390_GMAP + +Architectures: s390 + +This capability indicates that the user space memory used as guest mapping can +be anywhere in the user memory address space, as long as the memory slots are +aligned and sized to a segment (1MB) boundary. + +8.16 KVM_CAP_S390_COW + +Architectures: s390 + +This capability indicates that the user space memory used as guest mapping can +use copy-on-write semantics as well as dirty pages tracking via read-only page +tables. + +8.17 KVM_CAP_S390_BPB + +Architectures: s390 + +This capability indicates that kvm will implement the interfaces to handle +reset, migration and nested KVM for branch prediction blocking. The stfle +facility 82 should not be provided to the guest without this capability. + +8.18 KVM_CAP_HYPERV_TLBFLUSH + +Architectures: x86 + +This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush +hypercalls: +HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx, +HvFlushVirtualAddressList, HvFlushVirtualAddressListEx. + +8.19 KVM_CAP_ARM_INJECT_SERROR_ESR + +Architectures: arm, arm64 + +This capability indicates that userspace can specify (via the +KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it +takes a virtual SError interrupt exception. +If KVM advertises this capability, userspace can only specify the ISS field for +the ESR syndrome. Other parts of the ESR, such as the EC are generated by the +CPU when the exception is taken. If this virtual SError is taken to EL1 using +AArch64, this value will be reported in the ISS field of ESR_ELx. + +See KVM_CAP_VCPU_EVENTS for more details. +8.20 KVM_CAP_HYPERV_SEND_IPI + +Architectures: x86 + +This capability indicates that KVM supports paravirtualized Hyper-V IPI send +hypercalls: +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. diff --git a/Documentation/virt/kvm/arm/hyp-abi.txt b/Documentation/virt/kvm/arm/hyp-abi.txt new file mode 100644 index 000000000000..a20a0bee268d --- /dev/null +++ b/Documentation/virt/kvm/arm/hyp-abi.txt @@ -0,0 +1,53 @@ +* Internal ABI between the kernel and HYP + +This file documents the interaction between the Linux kernel and the +hypervisor layer when running Linux as a hypervisor (for example +KVM). It doesn't cover the interaction of the kernel with the +hypervisor when running as a guest (under Xen, KVM or any other +hypervisor), or any hypervisor-specific interaction when the kernel is +used as a host. + +On arm and arm64 (without VHE), the kernel doesn't run in hypervisor +mode, but still needs to interact with it, allowing a built-in +hypervisor to be either installed or torn down. + +In order to achieve this, the kernel must be booted at HYP (arm) or +EL2 (arm64), allowing it to install a set of stubs before dropping to +SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction, +and only act on individual CPUs. + +Unless specified otherwise, any built-in hypervisor must implement +these functions (see arch/arm{,64}/include/asm/virt.h): + +* r0/x0 = HVC_SET_VECTORS + r1/x1 = vectors + + Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors' + must be a physical address, and respect the alignment requirements + of the architecture. Only implemented by the initial stubs, not by + Linux hypervisors. + +* r0/x0 = HVC_RESET_VECTORS + + Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initials + stubs' exception vector value. This effectively disables an existing + hypervisor. + +* r0/x0 = HVC_SOFT_RESTART + r1/x1 = restart address + x2 = x0's value when entering the next payload (arm64) + x3 = x1's value when entering the next payload (arm64) + x4 = x2's value when entering the next payload (arm64) + + Mask all exceptions, disable the MMU, move the arguments into place + (arm64 only), and jump to the restart address while at HYP/EL2. This + hypercall is not expected to return to its caller. + +Any other value of r0/x0 triggers a hypervisor-specific handling, +which is not documented here. + +The return value of a stub hypercall is held by r0/x0, and is 0 on +success, and HVC_STUB_ERR on error. A stub hypercall is allowed to +clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and +ip on arm). It is thus recommended to use a function call to perform +the hypercall. diff --git a/Documentation/virt/kvm/arm/psci.txt b/Documentation/virt/kvm/arm/psci.txt new file mode 100644 index 000000000000..559586fc9d37 --- /dev/null +++ b/Documentation/virt/kvm/arm/psci.txt @@ -0,0 +1,61 @@ +KVM implements the PSCI (Power State Coordination Interface) +specification in order to provide services such as CPU on/off, reset +and power-off to the guest. + +The PSCI specification is regularly updated to provide new features, +and KVM implements these updates if they make sense from a virtualization +point of view. + +This means that a guest booted on two different versions of KVM can +observe two different "firmware" revisions. This could cause issues if +a given guest is tied to a particular PSCI revision (unlikely), or if +a migration causes a different PSCI version to be exposed out of the +blue to an unsuspecting guest. + +In order to remedy this situation, KVM exposes a set of "firmware +pseudo-registers" that can be manipulated using the GET/SET_ONE_REG +interface. These registers can be saved/restored by userspace, and set +to a convenient value if required. + +The following register is defined: + +* KVM_REG_ARM_PSCI_VERSION: + + - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set + (and thus has already been initialized) + - Returns the current PSCI version on GET_ONE_REG (defaulting to the + highest PSCI version implemented by KVM and compatible with v0.2) + - Allows any PSCI version implemented by KVM and compatible with + v0.2 to be set with SET_ONE_REG + - Affects the whole VM (even if the register view is per-vcpu) + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + Holds the state of the firmware support to mitigate CVE-2017-5715, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_1 in [1]. + Accepted values are: + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: KVM does not offer + firmware support for the workaround. The mitigation status for the + guest is unknown. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: The workaround HVC call is + available to the guest and required for the mitigation. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: The workaround HVC call + is available to the guest, but it is not needed on this VCPU. + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + Holds the state of the firmware support to mitigate CVE-2018-3639, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_2 in [1]. + Accepted values are: + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: A workaround is not + available. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: The workaround state is + unknown. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: The workaround is available, + and can be disabled by a vCPU. If + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for + this vCPU. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: The workaround is + always active on this vCPU or it is not needed. + +[1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst new file mode 100644 index 000000000000..01b081f6e7ea --- /dev/null +++ b/Documentation/virt/kvm/cpuid.rst @@ -0,0 +1,107 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +KVM CPUID bits +============== + +:Author: Glauber Costa + +A guest running on a kvm host, can check some of its features using +cpuid. This is not always guaranteed to work, since userspace can +mask-out some, or even all KVM-related cpuid features before launching +a guest. + +KVM cpuid functions are: + +function: KVM_CPUID_SIGNATURE (0x40000000) + +returns:: + + eax = 0x40000001 + ebx = 0x4b4d564b + ecx = 0x564b4d56 + edx = 0x4d + +Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". +The value in eax corresponds to the maximum cpuid function present in this leaf, +and will be updated if more functions are added in the future. +Note also that old hosts set eax value to 0x0. This should +be interpreted as if the value was 0x40000001. +This function queries the presence of KVM cpuid leafs. + +function: define KVM_CPUID_FEATURES (0x40000001) + +returns:: + + ebx, ecx + eax = an OR'ed group of (1 << flag) + +where ``flag`` is defined as below: + +================================= =========== ================================ +flag value meaning +================================= =========== ================================ +KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs + 0x11 and 0x12 + +KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays + on PIO operations + +KVM_FEATURE_MMU_OP 2 deprecated + +KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs + + 0x4b564d00 and 0x4b564d01 +KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by + writing to msr 0x4b564d02 + +KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by + writing to msr 0x4b564d03 + +KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt + handler can be enabled by + writing to msr 0x4b564d04 + +KVM_FEATURE_PV_UNHAULT 7 guest checks this feature bit + before enabling paravirtualized + spinlock support + +KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit + before enabling paravirtualized + tlb flush + +KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT + can be enabled by setting bit 2 + when writing to msr 0x4b564d02 + +KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit + before enabling paravirtualized + sebd IPIs + +KVM_FEATURE_PV_POLL_CONTROL 12 host-side polling on HLT can + be disabled by writing + to msr 0x4b564d05. + +KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit + before using paravirtualized + sched yield. + +KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side + per-cpu warps are expeced in + kvmclock +================================= =========== ================================ + +:: + + edx = an OR'ed group of (1 << flag) + +Where ``flag`` here is defined as below: + +================== ============ ================================= +flag value meaning +================== ============ ================================= +KVM_HINTS_REALTIME 0 guest checks this feature bit to + determine that vCPUs are never + preempted for an unlimited time + allowing optimizations +================== ============ ================================= diff --git a/Documentation/virt/kvm/devices/README b/Documentation/virt/kvm/devices/README new file mode 100644 index 000000000000..34a69834124a --- /dev/null +++ b/Documentation/virt/kvm/devices/README @@ -0,0 +1 @@ +This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL. diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.txt b/Documentation/virt/kvm/devices/arm-vgic-its.txt new file mode 100644 index 000000000000..eeaa95b893a8 --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic-its.txt @@ -0,0 +1,181 @@ +ARM Virtual Interrupt Translation Service (ITS) +=============================================== + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller + +The ITS allows MSI(-X) interrupts to be injected into guests. This extension is +optional. Creating a virtual ITS controller also requires a host GICv3 (see +arm-vgic-v3.txt), but does not depend on having physical ITS controllers. + +There can be multiple ITS controllers per guest, each of them has to have +a separate, non-overlapping MMIO region. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit) + Base address in the guest physical address space of the GICv3 ITS + control register frame. + This address needs to be 64K aligned and the region covers 128K. + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address + -EEXIST: Address already configured + -EFAULT: Invalid user pointer for attr->addr. + -ENODEV: Incorrect attribute or the ITS is not supported. + + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the ITS, no additional parameter in + kvm_device_attr.addr. + + KVM_DEV_ARM_ITS_CTRL_RESET + reset the ITS, no additional parameter in kvm_device_attr.addr. + See "ITS Reset State" section. + + KVM_DEV_ARM_ITS_SAVE_TABLES + save the ITS table data into guest RAM, at the location provisioned + by the guest in corresponding registers/table entries. + + The layout of the tables in guest memory defines an ABI. The entries + are laid out in little endian format as described in the last paragraph. + + KVM_DEV_ARM_ITS_RESTORE_TABLES + restore the ITS tables from guest RAM to ITS internal structures. + + The GICV3 must be restored before the ITS and all ITS registers but + the GITS_CTLR must be restored before restoring the ITS tables. + + The GITS_IIDR read-only register must also be restored before + calling KVM_DEV_ARM_ITS_RESTORE_TABLES as the IIDR revision field + encodes the ABI revision. + + The expected ordering when restoring the GICv3/ITS is described in section + "ITS Restore Sequence". + + Errors: + -ENXIO: ITS not properly configured as required prior to setting + this attribute + -ENOMEM: Memory shortage when allocating ITS internal data + -EINVAL: Inconsistent restored data + -EFAULT: Invalid guest ram access + -EBUSY: One or more VCPUS are running + -EACCES: The virtual ITS is backed by a physical GICv4 ITS, and the + state is not available + + KVM_DEV_ARM_VGIC_GRP_ITS_REGS + Attributes: + The attr field of kvm_device_attr encodes the offset of the + ITS register, relative to the ITS control frame base address + (ITS_base). + + kvm_device_attr.addr points to a __u64 value whatever the width + of the addressed register (32/64 bits). 64 bit registers can only + be accessed with full length. + + Writes to read-only registers are ignored by the kernel except for: + - GITS_CREADR. It must be restored otherwise commands in the queue + will be re-executed after restoring CWRITER. GITS_CREADR must be + restored before restoring the GITS_CTLR which is likely to enable the + ITS. Also it must be restored after GITS_CBASER since a write to + GITS_CBASER resets GITS_CREADR. + - GITS_IIDR. The Revision field encodes the table layout ABI revision. + In the future we might implement direct injection of virtual LPIs. + This will require an upgrade of the table layout and an evolution of + the ABI. GITS_IIDR must be restored before calling + KVM_DEV_ARM_ITS_RESTORE_TABLES. + + For other registers, getting or setting a register has the same + effect as reading/writing the register on real hardware. + Errors: + -ENXIO: Offset does not correspond to any supported register + -EFAULT: Invalid user pointer for attr->addr + -EINVAL: Offset is not 64-bit aligned + -EBUSY: one or more VCPUS are running + + ITS Restore Sequence: + ------------------------- + +The following ordering must be followed when restoring the GIC and the ITS: +a) restore all guest memory and create vcpus +b) restore all redistributors +c) provide the ITS base address + (KVM_DEV_ARM_VGIC_GRP_ADDR) +d) restore the ITS in the following order: + 1. Restore GITS_CBASER + 2. Restore all other GITS_ registers, except GITS_CTLR! + 3. Load the ITS table data (KVM_DEV_ARM_ITS_RESTORE_TABLES) + 4. Restore GITS_CTLR + +Then vcpus can be started. + + ITS Table ABI REV0: + ------------------- + + Revision 0 of the ABI only supports the features of a virtual GICv3, and does + not support a virtual GICv4 with support for direct injection of virtual + interrupts for nested hypervisors. + + The device table and ITT are indexed by the DeviceID and EventID, + respectively. The collection table is not indexed by CollectionID, and the + entries in the collection are listed in no particular order. + All entries are 8 bytes. + + Device Table Entry (DTE): + + bits: | 63| 62 ... 49 | 48 ... 5 | 4 ... 0 | + values: | V | next | ITT_addr | Size | + + where; + - V indicates whether the entry is valid. If not, other fields + are not meaningful. + - next: equals to 0 if this entry is the last one; otherwise it + corresponds to the DeviceID offset to the next DTE, capped by + 2^14 -1. + - ITT_addr matches bits [51:8] of the ITT address (256 Byte aligned). + - Size specifies the supported number of bits for the EventID, + minus one + + Collection Table Entry (CTE): + + bits: | 63| 62 .. 52 | 51 ... 16 | 15 ... 0 | + values: | V | RES0 | RDBase | ICID | + + where: + - V indicates whether the entry is valid. If not, other fields are + not meaningful. + - RES0: reserved field with Should-Be-Zero-or-Preserved behavior. + - RDBase is the PE number (GICR_TYPER.Processor_Number semantic), + - ICID is the collection ID + + Interrupt Translation Entry (ITE): + + bits: | 63 ... 48 | 47 ... 16 | 15 ... 0 | + values: | next | pINTID | ICID | + + where: + - next: equals to 0 if this entry is the last one; otherwise it corresponds + to the EventID offset to the next ITE capped by 2^16 -1. + - pINTID is the physical LPI ID; if zero, it means the entry is not valid + and other fields are not meaningful. + - ICID is the collection ID + + ITS Reset State: + ---------------- + +RESET returns the ITS to the same state that it was when first created and +initialized. When the RESET command returns, the following things are +guaranteed: + +- The ITS is not enabled and quiescent + GITS_CTLR.Enabled = 0 .Quiescent=1 +- There is no internally cached state +- No collection or device table are used + GITS_BASER.Valid = 0 +- GITS_CBASER = 0, GITS_CREADR = 0, GITS_CWRITER = 0 +- The ABI version is unchanged and remains the one set when the ITS + device was first created. diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.txt b/Documentation/virt/kvm/devices/arm-vgic-v3.txt new file mode 100644 index 000000000000..ff290b43c8e5 --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic-v3.txt @@ -0,0 +1,251 @@ +ARM Virtual Generic Interrupt Controller v3 and later (VGICv3) +============================================================== + + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 + +Only one VGIC instance may be instantiated through this API. The created VGIC +will act as the VM interrupt controller, requiring emulated user-space devices +to inject interrupts to the VGIC instead of directly to CPUs. It is not +possible to create both a GICv3 and GICv2 on the same VM. + +Creating a guest GICv3 device requires a host GICv3 as well. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit) + Base address in the guest physical address space of the GICv3 distributor + register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + This address needs to be 64K aligned and the region covers 64 KByte. + + KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit) + Base address in the guest physical address space of the GICv3 + redistributor register mappings. There are two 64K pages for each + VCPU and all of the redistributor pages are contiguous. + Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + This address needs to be 64K aligned. + + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (rw, 64-bit) + The attribute data pointed to by kvm_device_attr.addr is a __u64 value: + bits: | 63 .... 52 | 51 .... 16 | 15 - 12 |11 - 0 + values: | count | base | flags | index + - index encodes the unique redistributor region index + - flags: reserved for future use, currently 0 + - base field encodes bits [51:16] of the guest physical base address + of the first redistributor in the region. + - count encodes the number of redistributors in the region. Must be + greater than 0. + There are two 64K pages for each redistributor in the region and + redistributors are laid out contiguously within the region. Regions + are filled with redistributors in the index order. The sum of all + region count fields must be greater than or equal to the number of + VCPUs. Redistributor regions must be registered in the incremental + index order, starting from index 0. + The characteristics of a specific redistributor region can be read + by presetting the index field in the attr data. + Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + + It is invalid to mix calls with KVM_VGIC_V3_ADDR_TYPE_REDIST and + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attributes. + + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address, bad redistributor region + count/index, mixed redistributor region attribute usage + -EEXIST: Address already configured + -ENOENT: Attempt to read the characteristics of a non existing + redistributor region + -ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + -EFAULT: Invalid user pointer for attr->addr. + + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS + KVM_DEV_ARM_VGIC_GRP_REDIST_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 32 | 31 .... 0 | + values: | mpidr | offset | + + All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a + __u32 value. 64-bit registers must be accessed by separately accessing the + lower and higher word. + + Writes to read-only registers are ignored by the kernel. + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers. + KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU + specified by the mpidr. + + The offset is relative to the "[Re]Distributor base address" as defined + in the GICv3/4 specs. Getting or setting such a register has the same + effect as reading or writing the register on real hardware, except for the + following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR, + GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0. These registers behave + differently when accessed via this interface compared to their + architecturally defined behavior to allow software a full view of the + VGIC's internal state. + + The mpidr field is used to specify which + redistributor is accessed. The mpidr is ignored for the distributor. + + The mpidr encoding is based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + Note that distributor fields are not banked, but return the same value + regardless of the mpidr used to access the register. + + GICD_IIDR.Revision is updated when the KVM implementation is changed in a + way directly observable by the guest or userspace. Userspace should read + GICD_IIDR from KVM and write back the read value to confirm its expected + behavior is aligned with the KVM implementation. Userspace should set + GICD_IIDR before setting any other registers to ensure the expected + behavior. + + + The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such + that a write of a clear bit has no effect, whereas a write with a set bit + clears that value. To allow userspace to freely set the values of these two + registers, setting the attributes with the register offsets for these two + registers simply sets the non-reserved bits to the value written. + + + Accesses (reads and writes) to the GICD_ISPENDR register region and + GICR_ISPENDR0 registers get/set the value of the latched pending state for + the interrupts. + + This is identical to the value returned by a guest read from ISPENDR for an + edge triggered interrupt, but may differ for level triggered interrupts. + For edge triggered interrupts, once an interrupt becomes pending (whether + because of an edge detected on the input line or because of a guest write + to ISPENDR) this state is "latched", and only cleared when either the + interrupt is activated or when the guest writes to ICPENDR. A level + triggered interrupt may be pending either because the level input is held + high by a device, or because of a guest write to the ISPENDR register. Only + ISPENDR writes are latched; if the device lowers the line level then the + interrupt is no longer pending unless the guest also wrote to ISPENDR, and + conversely writes to ICPENDR or activations of the interrupt do not clear + the pending status if the line level is still being held high. (These + rules are documented in the GICv3 specification descriptions of the ICPENDR + and ISPENDR registers.) For a level triggered interrupt the value accessed + here is that of the latch which is set by ISPENDR and cleared by ICPENDR or + interrupt activation, whereas the value returned by a guest read from + ISPENDR is the logical OR of the latch value and the input line level. + + Raw access to the latch state is provided to userspace so that it can save + and restore the entire GIC internal state (which is defined by the + combination of the current input line level and the latch state, and cannot + be deduced from purely the line level and the value of the ISPENDR + registers). + + Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have + RAZ/WI semantics, meaning that reads always return 0 and writes are always + ignored. + + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running + + + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | + values: | mpidr | RES | instr | + + The mpidr field encodes the CPU ID based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + The instr field encodes the system register to access based on the fields + defined in the A64 instruction set encoding for system register access + (RES means the bits are reserved for future use and should be zero): + + | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 | + | Op 0 | Op1 | CRn | CRm | Op2 | + + All system regs accessed through this API are (rw, 64-bit) and + kvm_device_attr.addr points to a __u64 value. + + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the + CPU specified by the mpidr field. + + CPU interface registers access is not implemented for AArch32 mode. + Error -ENXIO is returned when accessed in AArch32 mode. + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: VCPU is running + -EINVAL: Invalid mpidr or register value supplied + + + KVM_DEV_ARM_VGIC_GRP_NR_IRQS + Attributes: + A value describing the number of interrupts (SGI, PPI and SPI) for + this GIC instance, ranging from 64 to 1024, in increments of 32. + + kvm_device_attr.addr points to a __u32 value. + + Errors: + -EINVAL: Value set is out of the expected range + -EBUSY: Value has already be set. + + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the VGIC, no additional parameter in + kvm_device_attr.addr. + KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES + save all LPI pending bits into guest RAM pending tables. + + The first kB of the pending table is not altered by this operation. + Errors: + -ENXIO: VGIC not properly configured as required prior to calling + this attribute + -ENODEV: no online VCPU + -ENOMEM: memory shortage when allocating vgic internal data + -EFAULT: Invalid guest ram access + -EBUSY: One or more VCPUS are running + + + KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO + Attributes: + The attr field of kvm_device_attr encodes the following values: + bits: | 63 .... 32 | 31 .... 10 | 9 .... 0 | + values: | mpidr | info | vINTID | + + The vINTID specifies which set of IRQs is reported on. + + The info field specifies which information userspace wants to get or set + using this interface. Currently we support the following info values: + + VGIC_LEVEL_INFO_LINE_LEVEL: + Get/Set the input level of the IRQ line for a set of 32 contiguously + numbered interrupts. + vINTID must be a multiple of 32. + + kvm_device_attr.addr points to a __u32 value which will contain a + bitmap where a set bit means the interrupt level is asserted. + + Bit[n] indicates the status for interrupt vINTID + n. + + SGIs and any interrupt with a higher ID than the number of interrupts + supported, will be RAZ/WI. LPIs are always edge-triggered and are + therefore not supported by this interface. + + PPIs are reported per VCPU as specified in the mpidr field, and SPIs are + reported with the same value regardless of the mpidr specified. + + The mpidr field encodes the CPU ID based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + Errors: + -EINVAL: vINTID is not multiple of 32 or + info field is not VGIC_LEVEL_INFO_LINE_LEVEL diff --git a/Documentation/virt/kvm/devices/arm-vgic.txt b/Documentation/virt/kvm/devices/arm-vgic.txt new file mode 100644 index 000000000000..97b6518148f8 --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic.txt @@ -0,0 +1,127 @@ +ARM Virtual Generic Interrupt Controller v2 (VGIC) +================================================== + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 + +Only one VGIC instance may be instantiated through either this API or the +legacy KVM_CREATE_IRQCHIP API. The created VGIC will act as the VM interrupt +controller, requiring emulated user-space devices to inject interrupts to the +VGIC instead of directly to CPUs. + +GICv3 implementations with hardware compatibility support allow creating a +guest GICv2 through this interface. For information on creating a guest GICv3 +device and guest ITS devices, see arm-vgic-v3.txt. It is not possible to +create both a GICv3 and GICv2 device on the same VM. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_V2_ADDR_TYPE_DIST (rw, 64-bit) + Base address in the guest physical address space of the GIC distributor + register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. + This address needs to be 4K aligned and the region covers 4 KByte. + + KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit) + Base address in the guest physical address space of the GIC virtual cpu + interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. + This address needs to be 4K aligned and the region covers 4 KByte. + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address + -EEXIST: Address already configured + -ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + -EFAULT: Invalid user pointer for attr->addr. + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | + values: | reserved | vcpu_index | offset | + + All distributor regs are (rw, 32-bit) + + The offset is relative to the "Distributor base address" as defined in the + GICv2 specs. Getting or setting such a register has the same effect as + reading or writing the register on the actual hardware from the cpu whose + index is specified with the vcpu_index field. Note that most distributor + fields are not banked, but return the same value regardless of the + vcpu_index used to access the register. + + GICD_IIDR.Revision is updated when the KVM implementation of an emulated + GICv2 is changed in a way directly observable by the guest or userspace. + Userspace should read GICD_IIDR from KVM and write back the read value to + confirm its expected behavior is aligned with the KVM implementation. + Userspace should set GICD_IIDR before setting any other registers (both + KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_CPU_REGS) to ensure + the expected behavior. Unless GICD_IIDR has been set from userspace, writes + to the interrupt group registers (GICD_IGROUPR) are ignored. + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running + -EINVAL: Invalid vcpu_index supplied + + KVM_DEV_ARM_VGIC_GRP_CPU_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | + values: | reserved | vcpu_index | offset | + + All CPU interface regs are (rw, 32-bit) + + The offset specifies the offset from the "CPU interface base address" as + defined in the GICv2 specs. Getting or setting such a register has the + same effect as reading or writing the register on the actual hardware. + + The Active Priorities Registers APRn are implementation defined, so we set a + fixed format for our implementation that fits with the model of a "GICv2 + implementation without the security extensions" which we present to the + guest. This interface always exposes four register APR[0-3] describing the + maximum possible 128 preemption levels. The semantics of the register + indicate if any interrupts in a given preemption level are in the active + state by setting the corresponding bit. + + Thus, preemption level X has one or more active interrupts if and only if: + + APRn[X mod 32] == 0b1, where n = X / 32 + + Bits for undefined preemption levels are RAZ/WI. + + Note that this differs from a CPU's view of the APRs on hardware in which + a GIC without the security extensions expose group 0 and group 1 active + priorities in separate register groups, whereas we show a combined view + similar to GICv2's GICH_APR. + + For historical reasons and to provide ABI compatibility with userspace we + export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask + field in the lower 5 bits of a word, meaning that userspace must always + use the lower 5 bits to communicate with the KVM device and must shift the + value left by 3 places to obtain the actual priority mask level. + + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running + -EINVAL: Invalid vcpu_index supplied + + KVM_DEV_ARM_VGIC_GRP_NR_IRQS + Attributes: + A value describing the number of interrupts (SGI, PPI and SPI) for + this GIC instance, ranging from 64 to 1024, in increments of 32. + + Errors: + -EINVAL: Value set is out of the expected range + -EBUSY: Value has already be set, or GIC has already been initialized + with default values. + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the VGIC or ITS, no additional parameter + in kvm_device_attr.addr. + Errors: + -ENXIO: VGIC not properly configured as required prior to calling + this attribute + -ENODEV: no online VCPU + -ENOMEM: memory shortage when allocating vgic internal data diff --git a/Documentation/virt/kvm/devices/mpic.txt b/Documentation/virt/kvm/devices/mpic.txt new file mode 100644 index 000000000000..8257397adc3c --- /dev/null +++ b/Documentation/virt/kvm/devices/mpic.txt @@ -0,0 +1,53 @@ +MPIC interrupt controller +========================= + +Device types supported: + KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0 + KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2 + +Only one MPIC instance, of any type, may be instantiated. The created +MPIC will act as the system interrupt controller, connecting to each +vcpu's interrupt inputs. + +Groups: + KVM_DEV_MPIC_GRP_MISC + Attributes: + KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit) + Base address of the 256 KiB MPIC register space. Must be + naturally aligned. A value of zero disables the mapping. + Reset value is zero. + + KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit) + Access an MPIC register, as if the access were made from the guest. + "attr" is the byte offset into the MPIC register space. Accesses + must be 4-byte aligned. + + MSIs may be signaled by using this attribute group to write + to the relevant MSIIR. + + KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit) + IRQ input line for each standard openpic source. 0 is inactive and 1 + is active, regardless of interrupt sense. + + For edge-triggered interrupts: Writing 1 is considered an activating + edge, and writing 0 is ignored. Reading returns 1 if a previously + signaled edge has not been acknowledged, and 0 otherwise. + + "attr" is the IRQ number. IRQ numbers for standard sources are the + byte offset of the relevant IVPR from EIVPR0, divided by 32. + +IRQ Routing: + + The MPIC emulation supports IRQ routing. Only a single MPIC device can + be instantiated. Once that device has been created, it's available as + irqchip id 0. + + This irqchip 0 has 256 interrupt pins, which expose the interrupts in + the main array of interrupt sources (a.k.a. "SRC" interrupts). + + The numbering is the same as the MPIC device tree binding -- based on + the register offset from the beginning of the sources array, without + regard to any subdivisions in chip documentation such as "internal" + or "external" interrupts. + + Access to non-SRC interrupts is not implemented through IRQ routing mechanisms. diff --git a/Documentation/virt/kvm/devices/s390_flic.txt b/Documentation/virt/kvm/devices/s390_flic.txt new file mode 100644 index 000000000000..a4e20a090174 --- /dev/null +++ b/Documentation/virt/kvm/devices/s390_flic.txt @@ -0,0 +1,163 @@ +FLIC (floating interrupt controller) +==================================== + +FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some +machine check interruptions. All interrupts are stored in a per-vm list of +pending interrupts. FLIC performs operations on this list. + +Only one FLIC instance may be instantiated. + +FLIC provides support to +- add interrupts (KVM_DEV_FLIC_ENQUEUE) +- inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS) +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) +- purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ) +- enable/disable for the guest transparent async page faults +- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) +- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) +- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) +- get/set all AIS mode states (KVM_DEV_FLIC_AISM_ALL) + +Groups: + KVM_DEV_FLIC_ENQUEUE + Passes a buffer and length into the kernel which are then injected into + the list of pending interrupts. + attr->addr contains the pointer to the buffer and attr->attr contains + the length of the buffer. + The format of the data structure kvm_s390_irq as it is copied from userspace + is defined in usr/include/linux/kvm.h. + + KVM_DEV_FLIC_GET_ALL_IRQS + Copies all floating interrupts into a buffer provided by userspace. + When the buffer is too small it returns -ENOMEM, which is the indication + for userspace to try again with a bigger buffer. + -ENOBUFS is returned when the allocation of a kernelspace buffer has + failed. + -EFAULT is returned when copying data to userspace failed. + All interrupts remain pending, i.e. are not deleted from the list of + currently pending interrupts. + attr->addr contains the userspace address of the buffer into which all + interrupt data will be copied. + attr->attr contains the size of the buffer in bytes. + + KVM_DEV_FLIC_CLEAR_IRQS + Simply deletes all elements from the list of currently pending floating + interrupts. No interrupts are injected into the guest. + + KVM_DEV_FLIC_CLEAR_IO_IRQ + Deletes one (if any) I/O interrupt for a subchannel identified by the + subsystem identification word passed via the buffer specified by + attr->addr (address) and attr->attr (length). + + KVM_DEV_FLIC_APF_ENABLE + Enables async page faults for the guest. So in case of a major page fault + the host is allowed to handle this async and continues the guest. + + KVM_DEV_FLIC_APF_DISABLE_WAIT + Disables async page faults for the guest and waits until already pending + async page faults are done. This is necessary to trigger a completion interrupt + for every init interrupt before migrating the interrupt list. + + KVM_DEV_FLIC_ADAPTER_REGISTER + Register an I/O adapter interrupt source. Takes a kvm_s390_io_adapter + describing the adapter to register: + +struct kvm_s390_io_adapter { + __u32 id; + __u8 isc; + __u8 maskable; + __u8 swap; + __u8 flags; +}; + + id contains the unique id for the adapter, isc the I/O interruption subclass + to use, maskable whether this adapter may be masked (interrupts turned off), + swap whether the indicators need to be byte swapped, and flags contains + further characteristics of the adapter. + Currently defined values for 'flags' are: + - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS + (adapter-interrupt-suppression) facility. This flag only has an effect if + the AIS capability is enabled. + Unknown flag values are ignored. + + + KVM_DEV_FLIC_ADAPTER_MODIFY + Modifies attributes of an existing I/O adapter interrupt source. Takes + a kvm_s390_io_adapter_req specifying the adapter and the operation: + +struct kvm_s390_io_adapter_req { + __u32 id; + __u8 type; + __u8 mask; + __u16 pad0; + __u64 addr; +}; + + id specifies the adapter and type the operation. The supported operations + are: + + KVM_S390_IO_ADAPTER_MASK + mask or unmask the adapter, as specified in mask + + KVM_S390_IO_ADAPTER_MAP + perform a gmap translation for the guest address provided in addr, + pin a userspace page for the translated address and add it to the + list of mappings + Note: A new mapping will be created unconditionally; therefore, + the calling code should avoid making duplicate mappings. + + KVM_S390_IO_ADAPTER_UNMAP + release a userspace page for the translated address specified in addr + from the list of mappings + + KVM_DEV_FLIC_AISM + modify the adapter-interruption-suppression mode for a given isc if the + AIS capability is enabled. Takes a kvm_s390_ais_req describing: + +struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; +}; + + isc contains the target I/O interruption subclass, mode the target + adapter-interruption-suppression mode. The following modes are + currently supported: + - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection + is always allowed; + - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq + injection is only allowed once and the following adapter interrupts + will be suppressed until the mode is set again to ALL-Interruptions + or SINGLE-Interruption mode. + + KVM_DEV_FLIC_AIRQ_INJECT + Inject adapter interrupts on a specified adapter. + attr->attr contains the unique id for the adapter, which allows for + adapter-specific checks and actions. + For adapters subject to AIS, handle the airq injection suppression for + an isc according to the adapter-interruption-suppression mode on condition + that the AIS capability is enabled. + + KVM_DEV_FLIC_AISM_ALL + Gets or sets the adapter-interruption-suppression mode for all ISCs. Takes + a kvm_s390_ais_all describing: + +struct kvm_s390_ais_all { + __u8 simm; /* Single-Interruption-Mode mask */ + __u8 nimm; /* No-Interruption-Mode mask * +}; + + simm contains Single-Interruption-Mode mask for all ISCs, nimm contains + No-Interruption-Mode mask for all ISCs. Each bit in simm and nimm corresponds + to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and + nimm bit presents AIS mode for a ISC. + + KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION. + +Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on +FLIC with an unknown group or attribute gives the error code EINVAL (instead of +ENXIO, as specified in the API documentation). It is not possible to conclude +that a FLIC operation is unavailable based on the error code resulting from a +usage attempt. + +Note: The KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl will return EINVAL in case a zero +schid is specified. diff --git a/Documentation/virt/kvm/devices/vcpu.txt b/Documentation/virt/kvm/devices/vcpu.txt new file mode 100644 index 000000000000..2b5dab16c4f2 --- /dev/null +++ b/Documentation/virt/kvm/devices/vcpu.txt @@ -0,0 +1,62 @@ +Generic vcpu interface +==================================== + +The virtual cpu "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, +KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same struct +kvm_device_attr as other devices, but targets VCPU-wide settings and controls. + +The groups and attributes per virtual cpu, if any, are architecture specific. + +1. GROUP: KVM_ARM_VCPU_PMU_V3_CTRL +Architectures: ARM64 + +1.1. ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_IRQ +Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a + pointer to an int +Returns: -EBUSY: The PMU overflow interrupt is already set + -ENXIO: The overflow interrupt not set when attempting to get it + -ENODEV: PMUv3 not supported + -EINVAL: Invalid PMU overflow interrupt number supplied or + trying to set the IRQ number without using an in-kernel + irqchip. + +A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt +number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt +type must be same for each vcpu. As a PPI, the interrupt number is the same for +all vcpus, while as an SPI it must be a separate number per vcpu. + +1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT +Parameters: no additional parameter in kvm_device_attr.addr +Returns: -ENODEV: PMUv3 not supported or GIC not initialized + -ENXIO: PMUv3 not properly configured or in-kernel irqchip not + configured as required prior to calling this attribute + -EBUSY: PMUv3 already initialized + +Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel +virtual GIC implementation, this must be done after initializing the in-kernel +irqchip. + + +2. GROUP: KVM_ARM_VCPU_TIMER_CTRL +Architectures: ARM,ARM64 + +2.1. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_VTIMER +2.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_PTIMER +Parameters: in kvm_device_attr.addr the address for the timer interrupt is a + pointer to an int +Returns: -EINVAL: Invalid timer interrupt number + -EBUSY: One or more VCPUs has already run + +A value describing the architected timer interrupt number when connected to an +in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the +attribute overrides the default values (see below). + +KVM_ARM_VCPU_TIMER_IRQ_VTIMER: The EL1 virtual timer intid (default: 27) +KVM_ARM_VCPU_TIMER_IRQ_PTIMER: The EL1 physical timer intid (default: 30) + +Setting the same PPI for different timers will prevent the VCPUs from running. +Setting the interrupt number on a VCPU configures all VCPUs created at that +time to use the number provided for a given timer, overwriting any previously +configured values on other VCPUs. Userspace should configure the interrupt +numbers on at least one VCPU after creating all VCPUs and before running any +VCPUs. diff --git a/Documentation/virt/kvm/devices/vfio.txt b/Documentation/virt/kvm/devices/vfio.txt new file mode 100644 index 000000000000..528c77c8022c --- /dev/null +++ b/Documentation/virt/kvm/devices/vfio.txt @@ -0,0 +1,36 @@ +VFIO virtual device +=================== + +Device types supported: + KVM_DEV_TYPE_VFIO + +Only one VFIO instance may be created per VM. The created device +tracks VFIO groups in use by the VM and features of those groups +important to the correctness and acceleration of the VM. As groups +are enabled and disabled for use by the VM, KVM should be updated +about their presence. When registered with KVM, a reference to the +VFIO-group is held by KVM. + +Groups: + KVM_DEV_VFIO_GROUP + +KVM_DEV_VFIO_GROUP attributes: + KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table + allocated by sPAPR KVM. + kvm_device_attr.addr points to a struct: + + struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; + }; + + where + @groupfd is a file descriptor for a VFIO group; + @tablefd is a file descriptor for a TCE table allocated via + KVM_CREATE_SPAPR_TCE. diff --git a/Documentation/virt/kvm/devices/vm.txt b/Documentation/virt/kvm/devices/vm.txt new file mode 100644 index 000000000000..4ffb82b02468 --- /dev/null +++ b/Documentation/virt/kvm/devices/vm.txt @@ -0,0 +1,270 @@ +Generic vm interface +==================================== + +The virtual machine "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, +KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same +struct kvm_device_attr as other devices, but targets VM-wide settings +and controls. + +The groups and attributes per virtual machine, if any, are architecture +specific. + +1. GROUP: KVM_S390_VM_MEM_CTRL +Architectures: s390 + +1.1. ATTRIBUTE: KVM_S390_VM_MEM_ENABLE_CMMA +Parameters: none +Returns: -EBUSY if a vcpu is already defined, otherwise 0 + +Enables Collaborative Memory Management Assist (CMMA) for the virtual machine. + +1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA +Parameters: none +Returns: -EINVAL if CMMA was not enabled + 0 otherwise + +Clear the CMMA status for all guest pages, so any pages the guest marked +as unused are again used any may not be reclaimed by the host. + +1.3. ATTRIBUTE KVM_S390_VM_MEM_LIMIT_SIZE +Parameters: in attr->addr the address for the new limit of guest memory +Returns: -EFAULT if the given address is not accessible + -EINVAL if the virtual machine is of type UCONTROL + -E2BIG if the given guest memory is to big for that machine + -EBUSY if a vcpu is already defined + -ENOMEM if not enough memory is available for a new shadow guest mapping + 0 otherwise + +Allows userspace to query the actual limit and set a new limit for +the maximum guest memory size. The limit will be rounded up to +2048 MB, 4096 GB, 8192 TB respectively, as this limit is governed by +the number of page table levels. In the case that there is no limit we will set +the limit to KVM_S390_NO_MEM_LIMIT (U64_MAX). + +2. GROUP: KVM_S390_VM_CPU_MODEL +Architectures: s390 + +2.1. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE (r/o) + +Allows user space to retrieve machine and kvm specific cpu related information: + +struct kvm_s390_vm_cpu_machine { + __u64 cpuid; # CPUID of host + __u32 ibc; # IBC level range offered by host + __u8 pad[4]; + __u64 fac_mask[256]; # set of cpu facilities enabled by KVM + __u64 fac_list[256]; # set of cpu facilities offered by host +} + +Parameters: address of buffer to store the machine related cpu data + of type struct kvm_s390_vm_cpu_machine* +Returns: -EFAULT if the given address is not accessible from kernel space + -ENOMEM if not enough memory is available to process the ioctl + 0 in case of success + +2.2. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR (r/w) + +Allows user space to retrieve or request to change cpu related information for a vcpu: + +struct kvm_s390_vm_cpu_processor { + __u64 cpuid; # CPUID currently (to be) used by this vcpu + __u16 ibc; # IBC level currently (to be) used by this vcpu + __u8 pad[6]; + __u64 fac_list[256]; # set of cpu facilities currently (to be) used + # by this vcpu +} + +KVM does not enforce or limit the cpu model data in any form. Take the information +retrieved by means of KVM_S390_VM_CPU_MACHINE as hint for reasonable configuration +setups. Instruction interceptions triggered by additionally set facility bits that +are not handled by KVM need to by imlemented in the VM driver code. + +Parameters: address of buffer to store/set the processor related cpu + data of type struct kvm_s390_vm_cpu_processor*. +Returns: -EBUSY in case 1 or more vcpus are already activated (only in write case) + -EFAULT if the given address is not accessible from kernel space + -ENOMEM if not enough memory is available to process the ioctl + 0 in case of success + +2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o) + +Allows user space to retrieve available cpu features. A feature is available if +provided by the hardware and supported by kvm. In theory, cpu features could +even be completely emulated by kvm. + +struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering +}; + +Parameters: address of a buffer to load the feature list from. +Returns: -EFAULT if the given address is not accessible from kernel space. + 0 in case of success. + +2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w) + +Allows user space to retrieve or change enabled cpu features for all VCPUs of a +VM. Features that are not available cannot be enabled. + +See 2.3. for a description of the parameter struct. + +Parameters: address of a buffer to store/load the feature list from. +Returns: -EFAULT if the given address is not accessible from kernel space. + -EINVAL if a cpu feature that is not available is to be enabled. + -EBUSY if at least one VCPU has already been defined. + 0 in case of success. + +2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o) + +Allows user space to retrieve available cpu subfunctions without any filtering +done by a set IBC. These subfunctions are indicated to the guest VCPU via +query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff. + +A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the +STFL(E) bit introducing the affected instruction. If the affected instruction +indicates subfunctions via a "query subfunction", the response block is +contained in the returned struct. If the affected instruction +indicates subfunctions via a "test bit" mechanism, the subfunction codes are +contained in the returned struct in MSB 0 bit numbering. + +struct kvm_s390_vm_cpu_subfunc { + u8 plo[32]; # always valid (ESA/390 feature) + u8 ptff[16]; # valid with TOD-clock steering + u8 kmac[16]; # valid with Message-Security-Assist + u8 kmc[16]; # valid with Message-Security-Assist + u8 km[16]; # valid with Message-Security-Assist + u8 kimd[16]; # valid with Message-Security-Assist + u8 klmd[16]; # valid with Message-Security-Assist + u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3 + u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmf[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 + u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 + u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 + u8 kma[16]; # valid with Message-Security-Assist-Extension 8 + u8 kdsa[16]; # valid with Message-Security-Assist-Extension 9 + u8 reserved[1792]; # reserved for future instructions +}; + +Parameters: address of a buffer to load the subfunction blocks from. +Returns: -EFAULT if the given address is not accessible from kernel space. + 0 in case of success. + +2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w) + +Allows user space to retrieve or change cpu subfunctions to be indicated for +all VCPUs of a VM. This attribute will only be available if kernel and +hardware support are in place. + +The kernel uses the configured subfunction blocks for indication to +the guest. A subfunction block will only be used if the associated STFL(E) bit +has not been disabled by user space (so the instruction to be queried is +actually available for the guest). + +As long as no data has been written, a read will fail. The IBC will be used +to determine available subfunctions in this case, this will guarantee backward +compatibility. + +See 2.5. for a description of the parameter struct. + +Parameters: address of a buffer to store/load the subfunction blocks from. +Returns: -EFAULT if the given address is not accessible from kernel space. + -EINVAL when reading, if there was no write yet. + -EBUSY if at least one VCPU has already been defined. + 0 in case of success. + +3. GROUP: KVM_S390_VM_TOD +Architectures: s390 + +3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH + +Allows user space to set/get the TOD clock extension (u8) (superseded by +KVM_S390_VM_TOD_EXT). + +Parameters: address of a buffer in user space to store the data (u8) to +Returns: -EFAULT if the given address is not accessible from kernel space + -EINVAL if setting the TOD clock extension to != 0 is not supported + +3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW + +Allows user space to set/get bits 0-63 of the TOD clock register as defined in +the POP (u64). + +Parameters: address of a buffer in user space to store the data (u64) to +Returns: -EFAULT if the given address is not accessible from kernel space + +3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT +Allows user space to set/get bits 0-63 of the TOD clock register as defined in +the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it +also allows user space to get/set it. If the guest CPU model does not support +it, it is stored as 0 and not allowed to be set to a value != 0. + +Parameters: address of a buffer in user space to store the data + (kvm_s390_vm_tod_clock) to +Returns: -EFAULT if the given address is not accessible from kernel space + -EINVAL if setting the TOD clock extension to != 0 is not supported + +4. GROUP: KVM_S390_VM_CRYPTO +Architectures: s390 + +4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o) + +Allows user space to enable aes key wrapping, including generating a new +wrapping key. + +Parameters: none +Returns: 0 + +4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o) + +Allows user space to enable dea key wrapping, including generating a new +wrapping key. + +Parameters: none +Returns: 0 + +4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o) + +Allows user space to disable aes key wrapping, clearing the wrapping key. + +Parameters: none +Returns: 0 + +4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o) + +Allows user space to disable dea key wrapping, clearing the wrapping key. + +Parameters: none +Returns: 0 + +5. GROUP: KVM_S390_VM_MIGRATION +Architectures: s390 + +5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o) + +Allows userspace to stop migration mode, needed for PGSTE migration. +Setting this attribute when migration mode is not active will have no +effects. + +Parameters: none +Returns: 0 + +5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o) + +Allows userspace to start migration mode, needed for PGSTE migration. +Setting this attribute when migration mode is already active will have +no effects. + +Parameters: none +Returns: -ENOMEM if there is not enough free memory to start migration mode + -EINVAL if the state of the VM is invalid (e.g. no memory defined) + 0 in case of success. + +5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o) + +Allows userspace to query the status of migration mode. + +Parameters: address of a buffer in user space to store the data (u64) to; + the data itself is either 0 if migration mode is disabled or 1 + if it is enabled +Returns: -EFAULT if the given address is not accessible from kernel space + 0 in case of success. diff --git a/Documentation/virt/kvm/devices/xics.txt b/Documentation/virt/kvm/devices/xics.txt new file mode 100644 index 000000000000..42864935ac5d --- /dev/null +++ b/Documentation/virt/kvm/devices/xics.txt @@ -0,0 +1,66 @@ +XICS interrupt controller + +Device type supported: KVM_DEV_TYPE_XICS + +Groups: + KVM_DEV_XICS_SOURCES + Attributes: One per interrupt source, indexed by the source number. + +This device emulates the XICS (eXternal Interrupt Controller +Specification) defined in PAPR. The XICS has a set of interrupt +sources, each identified by a 20-bit source number, and a set of +Interrupt Control Presentation (ICP) entities, also called "servers", +each associated with a virtual CPU. + +The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH +capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and +the interrupt server number (i.e. the vcpu number from the XICS's +point of view) in args[1] of the kvm_enable_cap struct. Each ICP has +64 bits of state which can be read and written using the +KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit +state word has the following bitfields, starting at the +least-significant end of the word: + +* Unused, 16 bits + +* Pending interrupt priority, 8 bits + Zero is the highest priority, 255 means no interrupt is pending. + +* Pending IPI (inter-processor interrupt) priority, 8 bits + Zero is the highest priority, 255 means no IPI is pending. + +* Pending interrupt source number, 24 bits + Zero means no interrupt pending, 2 means an IPI is pending + +* Current processor priority, 8 bits + Zero is the highest priority, meaning no interrupts can be + delivered, and 255 is the lowest priority. + +Each source has 64 bits of state that can be read and written using +the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the +KVM_DEV_XICS_SOURCES attribute group, with the attribute number being +the interrupt source number. The 64 bit state word has the following +bitfields, starting from the least-significant end of the word: + +* Destination (server number), 32 bits + This specifies where the interrupt should be sent, and is the + interrupt server number specified for the destination vcpu. + +* Priority, 8 bits + This is the priority specified for this interrupt source, where 0 is + the highest priority and 255 is the lowest. An interrupt with a + priority of 255 will never be delivered. + +* Level sensitive flag, 1 bit + This bit is 1 for a level-sensitive interrupt source, or 0 for + edge-sensitive (or MSI). + +* Masked flag, 1 bit + This bit is set to 1 if the interrupt is masked (cannot be delivered + regardless of its priority), for example by the ibm,int-off RTAS + call, or 0 if it is not masked. + +* Pending flag, 1 bit + This bit is 1 if the source has a pending interrupt, otherwise 0. + +Only one XICS instance may be created per VM. diff --git a/Documentation/virt/kvm/devices/xive.txt b/Documentation/virt/kvm/devices/xive.txt new file mode 100644 index 000000000000..9a24a4525253 --- /dev/null +++ b/Documentation/virt/kvm/devices/xive.txt @@ -0,0 +1,197 @@ +POWER9 eXternal Interrupt Virtualization Engine (XIVE Gen1) +========================================================== + +Device types supported: + KVM_DEV_TYPE_XIVE POWER9 XIVE Interrupt Controller generation 1 + +This device acts as a VM interrupt controller. It provides the KVM +interface to configure the interrupt sources of a VM in the underlying +POWER9 XIVE interrupt controller. + +Only one XIVE instance may be instantiated. A guest XIVE device +requires a POWER9 host and the guest OS should have support for the +XIVE native exploitation interrupt mode. If not, it should run using +the legacy interrupt mode, referred as XICS (POWER7/8). + +* Device Mappings + + The KVM device exposes different MMIO ranges of the XIVE HW which + are required for interrupt management. These are exposed to the + guest in VMAs populated with a custom VM fault handler. + + 1. Thread Interrupt Management Area (TIMA) + + Each thread has an associated Thread Interrupt Management context + composed of a set of registers. These registers let the thread + handle priority management and interrupt acknowledgment. The most + important are : + + - Interrupt Pending Buffer (IPB) + - Current Processor Priority (CPPR) + - Notification Source Register (NSR) + + They are exposed to software in four different pages each proposing + a view with a different privilege. The first page is for the + physical thread context and the second for the hypervisor. Only the + third (operating system) and the fourth (user level) are exposed the + guest. + + 2. Event State Buffer (ESB) + + Each source is associated with an Event State Buffer (ESB) with + either a pair of even/odd pair of pages which provides commands to + manage the source: to trigger, to EOI, to turn off the source for + instance. + + 3. Device pass-through + + When a device is passed-through into the guest, the source + interrupts are from a different HW controller (PHB4) and the ESB + pages exposed to the guest should accommadate this change. + + The passthru_irq helpers, kvmppc_xive_set_mapped() and + kvmppc_xive_clr_mapped() are called when the device HW irqs are + mapped into or unmapped from the guest IRQ number space. The KVM + device extends these helpers to clear the ESB pages of the guest IRQ + number being mapped and then lets the VM fault handler repopulate. + The handler will insert the ESB page corresponding to the HW + interrupt of the device being passed-through or the initial IPI ESB + page if the device has being removed. + + The ESB remapping is fully transparent to the guest and the OS + device driver. All handling is done within VFIO and the above + helpers in KVM-PPC. + +* Groups: + + 1. KVM_DEV_XIVE_GRP_CTRL + Provides global controls on the device + Attributes: + 1.1 KVM_DEV_XIVE_RESET (write only) + Resets the interrupt controller configuration for sources and event + queues. To be used by kexec and kdump. + Errors: none + + 1.2 KVM_DEV_XIVE_EQ_SYNC (write only) + Sync all the sources and queues and mark the EQ pages dirty. This + to make sure that a consistent memory state is captured when + migrating the VM. + Errors: none + + 2. KVM_DEV_XIVE_GRP_SOURCE (write only) + Initializes a new source in the XIVE device and mask it. + Attributes: + Interrupt source number (64-bit) + The kvm_device_attr.addr points to a __u64 value: + bits: | 63 .... 2 | 1 | 0 + values: | unused | level | type + - type: 0:MSI 1:LSI + - level: assertion level in case of an LSI. + Errors: + -E2BIG: Interrupt source number is out of range + -ENOMEM: Could not create a new source block + -EFAULT: Invalid user pointer for attr->addr. + -ENXIO: Could not allocate underlying HW interrupt + + 3. KVM_DEV_XIVE_GRP_SOURCE_CONFIG (write only) + Configures source targeting + Attributes: + Interrupt source number (64-bit) + The kvm_device_attr.addr points to a __u64 value: + bits: | 63 .... 33 | 32 | 31 .. 3 | 2 .. 0 + values: | eisn | mask | server | priority + - priority: 0-7 interrupt priority level + - server: CPU number chosen to handle the interrupt + - mask: mask flag (unused) + - eisn: Effective Interrupt Source Number + Errors: + -ENOENT: Unknown source number + -EINVAL: Not initialized source number + -EINVAL: Invalid priority + -EINVAL: Invalid CPU number. + -EFAULT: Invalid user pointer for attr->addr. + -ENXIO: CPU event queues not configured or configuration of the + underlying HW interrupt failed + -EBUSY: No CPU available to serve interrupt + + 4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write) + Configures an event queue of a CPU + Attributes: + EQ descriptor identifier (64-bit) + The EQ descriptor identifier is a tuple (server, priority) : + bits: | 63 .... 32 | 31 .. 3 | 2 .. 0 + values: | unused | server | priority + The kvm_device_attr.addr points to : + struct kvm_ppc_xive_eq { + __u32 flags; + __u32 qshift; + __u64 qaddr; + __u32 qtoggle; + __u32 qindex; + __u8 pad[40]; + }; + - flags: queue flags + KVM_XIVE_EQ_ALWAYS_NOTIFY (required) + forces notification without using the coalescing mechanism + provided by the XIVE END ESBs. + - qshift: queue size (power of 2) + - qaddr: real address of queue + - qtoggle: current queue toggle bit + - qindex: current queue index + - pad: reserved for future use + Errors: + -ENOENT: Invalid CPU number + -EINVAL: Invalid priority + -EINVAL: Invalid flags + -EINVAL: Invalid queue size + -EINVAL: Invalid queue address + -EFAULT: Invalid user pointer for attr->addr. + -EIO: Configuration of the underlying HW failed + + 5. KVM_DEV_XIVE_GRP_SOURCE_SYNC (write only) + Synchronize the source to flush event notifications + Attributes: + Interrupt source number (64-bit) + Errors: + -ENOENT: Unknown source number + -EINVAL: Not initialized source number + +* VCPU state + + The XIVE IC maintains VP interrupt state in an internal structure + called the NVT. When a VP is not dispatched on a HW processor + thread, this structure can be updated by HW if the VP is the target + of an event notification. + + It is important for migration to capture the cached IPB from the NVT + as it synthesizes the priorities of the pending interrupts. We + capture a bit more to report debug information. + + KVM_REG_PPC_VP_STATE (2 * 64bits) + bits: | 63 .... 32 | 31 .... 0 | + values: | TIMA word0 | TIMA word1 | + bits: | 127 .......... 64 | + values: | unused | + +* Migration: + + Saving the state of a VM using the XIVE native exploitation mode + should follow a specific sequence. When the VM is stopped : + + 1. Mask all sources (PQ=01) to stop the flow of events. + + 2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to + flush any in-flight event notification and to stabilize the EQs. At + this stage, the EQ pages are marked dirty to make sure they are + transferred in the migration sequence. + + 3. Capture the state of the source targeting, the EQs configuration + and the state of thread interrupt context registers. + + Restore is similar : + + 1. Restore the EQ configuration. As targeting depends on it. + 2. Restore targeting + 3. Restore the thread interrupt contexts + 4. Restore the source states + 5. Let the vCPU run diff --git a/Documentation/virt/kvm/halt-polling.txt b/Documentation/virt/kvm/halt-polling.txt new file mode 100644 index 000000000000..4f791b128dd2 --- /dev/null +++ b/Documentation/virt/kvm/halt-polling.txt @@ -0,0 +1,136 @@ +The KVM halt polling system +=========================== + +The KVM halt polling system provides a feature within KVM whereby the latency +of a guest can, under some circumstances, be reduced by polling in the host +for some time period after the guest has elected to no longer run by cedeing. +That is, when a guest vcpu has ceded, or in the case of powerpc when all of the +vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions +before giving up the cpu to the scheduler in order to let something else run. + +Polling provides a latency advantage in cases where the guest can be run again +very quickly by at least saving us a trip through the scheduler, normally on +the order of a few micro-seconds, although performance benefits are workload +dependant. In the event that no wakeup source arrives during the polling +interval or some other task on the runqueue is runnable the scheduler is +invoked. Thus halt polling is especially useful on workloads with very short +wakeup periods where the time spent halt polling is minimised and the time +savings of not invoking the scheduler are distinguishable. + +The generic halt polling code is implemented in: + + virt/kvm/kvm_main.c: kvm_vcpu_block() + +The powerpc kvm-hv specific case is implemented in: + + arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked() + +Halt Polling Interval +===================== + +The maximum time for which to poll before invoking the scheduler, referred to +as the halt polling interval, is increased and decreased based on the perceived +effectiveness of the polling in an attempt to limit pointless polling. +This value is stored in either the vcpu struct: + + kvm_vcpu->halt_poll_ns + +or in the case of powerpc kvm-hv, in the vcore struct: + + kvmppc_vcore->halt_poll_ns + +Thus this is a per vcpu (or vcore) value. + +During polling if a wakeup source is received within the halt polling interval, +the interval is left unchanged. In the event that a wakeup source isn't +received during the polling interval (and thus schedule is invoked) there are +two options, either the polling interval and total block time[0] were less than +the global max polling interval (see module params below), or the total block +time was greater than the global max polling interval. + +In the event that both the polling interval and total block time were less than +the global max polling interval then the polling interval can be increased in +the hope that next time during the longer polling interval the wake up source +will be received while the host is polling and the latency benefits will be +received. The polling interval is grown in the function grow_halt_poll_ns() and +is multiplied by the module parameters halt_poll_ns_grow and +halt_poll_ns_grow_start. + +In the event that the total block time was greater than the global max polling +interval then the host will never poll for long enough (limited by the global +max) to wakeup during the polling interval so it may as well be shrunk in order +to avoid pointless polling. The polling interval is shrunk in the function +shrink_halt_poll_ns() and is divided by the module parameter +halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0. + +It is worth noting that this adjustment process attempts to hone in on some +steady state polling interval but will only really do a good job for wakeups +which come at an approximately constant rate, otherwise there will be constant +adjustment of the polling interval. + +[0] total block time: the time between when the halt polling function is + invoked and a wakeup source received (irrespective of + whether the scheduler is invoked within that function). + +Module Parameters +================= + +The kvm module has 3 tuneable module parameters to adjust the global max +polling interval as well as the rate at which the polling interval is grown and +shrunk. These variables are defined in include/linux/kvm_host.h and as module +parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the +powerpc kvm-hv case. + +Module Parameter | Description | Default Value +-------------------------------------------------------------------------------- +halt_poll_ns | The global max polling | KVM_HALT_POLL_NS_DEFAULT + | interval which defines | + | the ceiling value of the | + | polling interval for | (per arch value) + | each vcpu. | +-------------------------------------------------------------------------------- +halt_poll_ns_grow | The value by which the | 2 + | halt polling interval is | + | multiplied in the | + | grow_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- +halt_poll_ns_grow_start | The initial value to grow | 10000 + | to from zero in the | + | grow_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- +halt_poll_ns_shrink | The value by which the | 0 + | halt polling interval is | + | divided in the | + | shrink_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- + +These module parameters can be set from the debugfs files in: + + /sys/module/kvm/parameters/ + +Note: that these module parameters are system wide values and are not able to + be tuned on a per vm basis. + +Further Notes +============= + +- Care should be taken when setting the halt_poll_ns module parameter as a +large value has the potential to drive the cpu usage to 100% on a machine which +would be almost entirely idle otherwise. This is because even if a guest has +wakeups during which very little work is done and which are quite far apart, if +the period is shorter than the global max polling interval (halt_poll_ns) then +the host will always poll for the entire block time and thus cpu utilisation +will go to 100%. + +- Halt polling essentially presents a trade off between power usage and latency +and the module parameters should be used to tune the affinity for this. Idle +cpu time is essentially converted to host kernel time with the aim of decreasing +latency when entering the guest. + +- Halt polling will only be conducted by the host when no other tasks are +runnable on that cpu, otherwise the polling will cease immediately and +schedule will be invoked to allow that other task to run. Thus this doesn't +allow a guest to denial of service the cpu. diff --git a/Documentation/virt/kvm/hypercalls.txt b/Documentation/virt/kvm/hypercalls.txt new file mode 100644 index 000000000000..5f6d291bd004 --- /dev/null +++ b/Documentation/virt/kvm/hypercalls.txt @@ -0,0 +1,154 @@ +Linux KVM Hypercall: +=================== +X86: + KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall + instruction. The hypervisor can replace it with instructions that are + guaranteed to be supported. + + Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. + The hypercall number should be placed in rax and the return value will be + placed in rax. No other registers will be clobbered unless explicitly stated + by the particular hypercall. + +S390: + R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall + number. The return value is written to R2. + + S390 uses diagnose instruction as hypercall (0x500) along with hypercall + number in R1. + + For further information on the S390 diagnose call as supported by KVM, + refer to Documentation/virt/kvm/s390-diag.txt. + + PowerPC: + It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. + Return value is placed in R3. + + KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' + property inside the device tree's /hypervisor node. + For more information refer to Documentation/virt/kvm/ppc-pv.txt + +MIPS: + KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall + number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and + the return value is placed in $2 (v0). + +KVM Hypercalls Documentation +=========================== +The template for each hypercall is: +1. Hypercall name. +2. Architecture(s) +3. Status (deprecated, obsolete, active) +4. Purpose + +1. KVM_HC_VAPIC_POLL_IRQ +------------------------ +Architecture: x86 +Status: active +Purpose: Trigger guest exit so that the host can check for pending +interrupts on reentry. + +2. KVM_HC_MMU_OP +------------------------ +Architecture: x86 +Status: deprecated. +Purpose: Support MMU operations such as writing to PTE, +flushing TLB, release PT. + +3. KVM_HC_FEATURES +------------------------ +Architecture: PPC +Status: active +Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid +used to enumerate which hypercalls are available. On PPC, either device tree +based lookup ( which is also what EPAPR dictates) OR KVM specific enumeration +mechanism (which is this hypercall) can be used. + +4. KVM_HC_PPC_MAP_MAGIC_PAGE +------------------------ +Architecture: PPC +Status: active +Purpose: To enable communication between the hypervisor and guest there is a +shared page that contains parts of supervisor visible register state. +The guest can map this shared page to access its supervisor register through +memory using this hypercall. + +5. KVM_HC_KICK_CPU +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to wakeup a vcpu from HLT state +Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest +kernel mode for an event to occur (ex: a spinlock to become available) can +execute HLT instruction once it has busy-waited for more than a threshold +time-interval. Execution of HLT instruction would cause the hypervisor to put +the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the +same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, +specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) +is used in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to synchronize host and guest clocks. +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_offset" structure. + +a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock. + * tsc: guest TSC value used to calculate sec/nsec pair + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant. + +Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, +or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. + +6. KVM_HC_SEND_IPI +------------------------ +Architecture: x86 +Status: active +Purpose: Send IPIs to multiple vCPUs. + +a0: lower part of the bitmap of destination APIC IDs +a1: higher part of the bitmap of destination APIC IDs +a2: the lowest APIC ID in bitmap +a3: APIC ICR + +The hypercall lets a guest send multicast IPIs, with at most 128 +128 destinations per hypercall in 64-bit mode and 64 vCPUs per +hypercall in 32-bit mode. The destinations are represented by a +bitmap contained in the first two arguments (a0 and a1). Bit 0 of +a0 corresponds to the APIC ID in the third argument (a2), bit 1 +corresponds to the APIC ID a2+1, and so on. + +Returns the number of CPUs to which the IPIs were delivered successfully. + +7. KVM_HC_SCHED_YIELD +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to yield if the IPI target vCPU is preempted + +a0: destination APIC ID + +Usage example: When sending a call-function IPI-many to vCPUs, yield if +any of the IPI target vCPUs was preempted. diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst new file mode 100644 index 000000000000..0b206a06f5be --- /dev/null +++ b/Documentation/virt/kvm/index.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=== +KVM +=== + +.. toctree:: + :maxdepth: 2 + + amd-memory-encryption + cpuid diff --git a/Documentation/virt/kvm/locking.txt b/Documentation/virt/kvm/locking.txt new file mode 100644 index 000000000000..635cd6eaf714 --- /dev/null +++ b/Documentation/virt/kvm/locking.txt @@ -0,0 +1,215 @@ +KVM Lock Overview +================= + +1. Acquisition Orders +--------------------- + +The acquisition orders for mutexes are as follows: + +- kvm->lock is taken outside vcpu->mutex + +- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock + +- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring + them together is quite rare. + +On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. + +Everything else is a leaf: no other lock is taken inside the critical +sections. + +2: Exception +------------ + +Fast page fault: + +Fast page fault is the fast path which fixes the guest page fault out of +the mmu-lock on x86. Currently, the page fault can be fast in one of the +following two cases: + +1. Access Tracking: The SPTE is not present, but it is marked for access +tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to +restore the saved R/X bits. This is described in more detail later below. + +2. Write-Protection: The SPTE is present and the fault is +caused by write-protect. That means we just need to change the W bit of the +spte. + +What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and +SPTE_MMU_WRITEABLE bit on the spte: +- SPTE_HOST_WRITEABLE means the gfn is writable on host. +- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when + the gfn is writable on guest mmu and it is not write-protected by shadow + page write-protection. + +On fast page fault path, we will use cmpxchg to atomically set the spte W +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or +restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This +is safe because whenever changing these bits can be detected by cmpxchg. + +But we need carefully check these cases: +1): The mapping from gfn to pfn +The mapping from gfn to pfn may be changed since we can only ensure the pfn +is not changed during cmpxchg. This is a ABA problem, for example, below case +will happen: + +At the beginning: +gpte = gfn1 +gfn1 is mapped to pfn1 on host +spte is the shadow page table entry corresponding with gpte and +spte = pfn1 + + VCPU 0 VCPU0 +on fast page fault path: + + old_spte = *spte; + pfn1 is swapped out: + spte = 0; + + pfn1 is re-alloced for gfn2. + + gpte is changed to point to + gfn2 by the guest: + spte = pfn1; + + if (cmpxchg(spte, old_spte, old_spte+W) + mark_page_dirty(vcpu->kvm, gfn1) + OOPS!!! + +We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. + +For direct sp, we can easily avoid it since the spte of direct sp is fixed +to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() +to pin gfn to pfn, because after gfn_to_pfn_atomic(): +- We have held the refcount of pfn that means the pfn can not be freed and + be reused for another gfn. +- The pfn is writable that means it can not be shared between different gfns + by KSM. + +Then, we can ensure the dirty bitmaps is correctly set for a gfn. + +Currently, to simplify the whole things, we disable fast page fault for +indirect shadow page. + +2): Dirty bit tracking +In the origin code, the spte can be fast updated (non-atomically) if the +spte is read-only and the Accessed bit has already been set since the +Accessed bit and Dirty bit can not be lost. + +But it is not true after fast page fault since the spte can be marked +writable between reading spte and updating spte. Like below case: + +At the beginning: +spte.W = 0 +spte.Accessed = 1 + + VCPU 0 VCPU0 +In mmu_spte_clear_track_bits(): + + old_spte = *spte; + + /* 'if' condition is satisfied. */ + if (old_spte.Accessed == 1 && + old_spte.W == 0) + spte = 0ull; + on fast page fault path: + spte.W = 1 + memory write on the spte: + spte.Dirty = 1 + + + else + old_spte = xchg(spte, 0ull) + + + if (old_spte.Accessed == 1) + kvm_set_pfn_accessed(spte.pfn); + if (old_spte.Dirty == 1) + kvm_set_pfn_dirty(spte.pfn); + OOPS!!! + +The Dirty bit is lost in this case. + +In order to avoid this kind of issue, we always treat the spte as "volatile" +if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, +the spte is always atomically updated in this case. + +3): flush tlbs due to spte updated +If the spte is updated from writable to readonly, we should flush all TLBs, +otherwise rmap_write_protect will find a read-only spte, even though the +writable spte might be cached on a CPU's TLB. + +As mentioned before, the spte can be updated to writable out of mmu-lock on +fast page fault path, in order to easily audit the path, we see if TLBs need +be flushed caused by this reason in mmu_spte_update() since this is a common +function to update spte (present -> present). + +Since the spte is "volatile" if it can be updated out of mmu-lock, we always +atomically update the spte, the race caused by fast page fault can be avoided, +See the comments in spte_has_volatile_bits() and mmu_spte_update(). + +Lockless Access Tracking: + +This is used for Intel CPUs that are using EPT but do not support the EPT A/D +bits. In this case, when the KVM MMU notifier is called to track accesses to a +page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present +by clearing the RWX bits in the PTE and storing the original R & X bits in +some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the +PTE (using the ignored bit 62). When the VM tries to access the page later on, +a fault is generated and the fast page fault mechanism described above is used +to atomically restore the PTE to a Present state. The W bit is not saved when +the PTE is marked for access tracking and during restoration to the Present +state, the W bit is set depending on whether or not it was a write access. If +it wasn't, then the W bit will remain clear until a write access happens, at +which time it will be set using the Dirty tracking mechanism described above. + +3. Reference +------------ + +Name: kvm_lock +Type: mutex +Arch: any +Protects: - vm_list + +Name: kvm_count_lock +Type: raw_spinlock_t +Arch: any +Protects: - hardware virtualization enable/disable +Comment: 'raw' because hardware enabling/disabling must be atomic /wrt + migration. + +Name: kvm_arch::tsc_write_lock +Type: raw_spinlock +Arch: x86 +Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} + - tsc offset in vmcb +Comment: 'raw' because updating the tsc offsets must not be preempted. + +Name: kvm->mmu_lock +Type: spinlock_t +Arch: any +Protects: -shadow page/shadow tlb entry +Comment: it is a spinlock since it is used in mmu notifier. + +Name: kvm->srcu +Type: srcu lock +Arch: any +Protects: - kvm->memslots + - kvm->buses +Comment: The srcu read lock must be held while accessing memslots (e.g. + when using gfn_to_* functions) and while accessing in-kernel + MMIO/PIO address->device structure mapping (kvm->buses). + The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu + if it is needed by multiple functions. + +Name: blocked_vcpu_on_cpu_lock +Type: spinlock_t +Arch: x86 +Protects: blocked_vcpu_on_cpu +Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. + When VT-d posted-interrupts is supported and the VM has assigned + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu + protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues + wakeup notification event since external interrupts from the + assigned devices happens, we will find the vCPU on the list to + wakeup. diff --git a/Documentation/virt/kvm/mmu.txt b/Documentation/virt/kvm/mmu.txt new file mode 100644 index 000000000000..1b9880dfba0a --- /dev/null +++ b/Documentation/virt/kvm/mmu.txt @@ -0,0 +1,449 @@ +The x86 kvm shadow mmu +====================== + +The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible +for presenting a standard x86 mmu to the guest, while translating guest +physical addresses to host physical addresses. + +The mmu code attempts to satisfy the following requirements: + +- correctness: the guest should not be able to determine that it is running + on an emulated mmu except for timing (we attempt to comply + with the specification, not emulate the characteristics of + a particular implementation such as tlb size) +- security: the guest must not be able to touch host memory not assigned + to it +- performance: minimize the performance penalty imposed by the mmu +- scaling: need to scale to large memory and large vcpu guests +- hardware: support the full range of x86 virtualization hardware +- integration: Linux memory management code must be in control of guest memory + so that swapping, page migration, page merging, transparent + hugepages, and similar features work without change +- dirty tracking: report writes to guest memory to enable live migration + and framebuffer-based displays +- footprint: keep the amount of pinned kernel memory low (most memory + should be shrinkable) +- reliability: avoid multipage or GFP_ATOMIC allocations + +Acronyms +======== + +pfn host page frame number +hpa host physical address +hva host virtual address +gfn guest frame number +gpa guest physical address +gva guest virtual address +ngpa nested guest physical address +ngva nested guest virtual address +pte page table entry (used also to refer generically to paging structure + entries) +gpte guest pte (referring to gfns) +spte shadow pte (referring to pfns) +tdp two dimensional paging (vendor neutral term for NPT and EPT) + +Virtual and real hardware supported +=================================== + +The mmu supports first-generation mmu hardware, which allows an atomic switch +of the current paging mode and cr3 during guest entry, as well as +two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware +it exposes is the traditional 2/3/4 level x86 mmu, with support for global +pages, pae, pse, pse36, cr0.wp, and 1GB pages. Emulated hardware also +able to expose NPT capable hardware on NPT capable hosts. + +Translation +=========== + +The primary job of the mmu is to program the processor's mmu to translate +addresses for the guest. Different translations are required at different +times: + +- when guest paging is disabled, we translate guest physical addresses to + host physical addresses (gpa->hpa) +- when guest paging is enabled, we translate guest virtual addresses, to + guest physical addresses, to host physical addresses (gva->gpa->hpa) +- when the guest launches a guest of its own, we translate nested guest + virtual addresses, to nested guest physical addresses, to guest physical + addresses, to host physical addresses (ngva->ngpa->gpa->hpa) + +The primary challenge is to encode between 1 and 3 translations into hardware +that support only 1 (traditional) and 2 (tdp) translations. When the +number of required translations matches the hardware, the mmu operates in +direct mode; otherwise it operates in shadow mode (see below). + +Memory +====== + +Guest memory (gpa) is part of the user address space of the process that is +using kvm. Userspace defines the translation between guest addresses and user +addresses (gpa->hva); note that two gpas may alias to the same hva, but not +vice versa. + +These hvas may be backed using any method available to the host: anonymous +memory, file backed memory, and device memory. Memory might be paged by the +host at any time. + +Events +====== + +The mmu is driven by events, some from the guest, some from the host. + +Guest generated events: +- writes to control registers (especially cr3) +- invlpg/invlpga instruction execution +- access to missing or protected translations + +Host generated events: +- changes in the gpa->hpa translation (either through gpa->hva changes or + through hva->hpa changes) +- memory pressure (the shrinker) + +Shadow pages +============ + +The principal data structure is the shadow page, 'struct kvm_mmu_page'. A +shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A +shadow page may contain a mix of leaf and nonleaf sptes. + +A nonleaf spte allows the hardware mmu to reach the leaf pages and +is not related to a translation directly. It points to other shadow pages. + +A leaf spte corresponds to either one or two translations encoded into +one paging structure entry. These are always the lowest level of the +translation stack, with optional higher level translations left to NPT/EPT. +Leaf ptes point at guest pages. + +The following table shows translations encoded by leaf ptes, with higher-level +translations in parentheses: + + Non-nested guests: + nonpaging: gpa->hpa + paging: gva->gpa->hpa + paging, tdp: (gva->)gpa->hpa + Nested guests: + non-tdp: ngva->gpa->hpa (*) + tdp: (ngva->)ngpa->gpa->hpa + +(*) the guest hypervisor will encode the ngva->gpa translation into its page + tables if npt is not present + +Shadow pages contain the following information: + role.level: + The level in the shadow paging hierarchy that this shadow page belongs to. + 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. + role.direct: + If set, leaf sptes reachable from this page are for a linear range. + Examples include real mode translation, large guest pages backed by small + host pages, and gpa->hpa translations when NPT or EPT is active. + The linear range starts at (gfn << PAGE_SHIFT) and its size is determined + by role.level (2MB for first level, 1GB for second level, 0.5TB for third + level, 256TB for fourth level) + If clear, this page corresponds to a guest page table denoted by the gfn + field. + role.quadrant: + When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit + sptes. That means a guest page table contains more ptes than the host, + so multiple shadow pages are needed to shadow one guest page. + For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the + first or second 512-gpte block in the guest page table. For second-level + page tables, each 32-bit gpte is converted to two 64-bit sptes + (since each first-level guest page is shadowed by two first-level + shadow pages) so role.quadrant takes values in the range 0..3. Each + quadrant maps 1GB virtual address space. + role.access: + Inherited guest access permissions in the form uwx. Note execute + permission is positive, not negative. + role.invalid: + The page is invalid and should not be used. It is a root page that is + currently pinned (by a cpu hardware register pointing to it); once it is + unpinned it will be destroyed. + role.gpte_is_8_bytes: + Reflects the size of the guest PTE for which the page is valid, i.e. '1' + if 64-bit gptes are in use, '0' if 32-bit gptes are in use. + role.nxe: + Contains the value of efer.nxe for which the page is valid. + role.cr0_wp: + Contains the value of cr0.wp for which the page is valid. + role.smep_andnot_wp: + Contains the value of cr4.smep && !cr0.wp for which the page is valid + (pages for which this is true are different from other pages; see the + treatment of cr0.wp=0 below). + role.smap_andnot_wp: + Contains the value of cr4.smap && !cr0.wp for which the page is valid + (pages for which this is true are different from other pages; see the + treatment of cr0.wp=0 below). + role.ept_sp: + This is a virtual flag to denote a shadowed nested EPT page. ept_sp + is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination. + role.smm: + Is 1 if the page is valid in system management mode. This field + determines which of the kvm_memslots array was used to build this + shadow page; it is also used to go back from a struct kvm_mmu_page + to a memslot, through the kvm_memslots_for_spte_role macro and + __gfn_to_memslot. + role.ad_disabled: + Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D + bits before Haswell; shadow EPT page tables also cannot use A/D bits + if the L1 hypervisor does not enable them. + gfn: + Either the guest page table containing the translations shadowed by this + page, or the base page frame for linear translations. See role.direct. + spt: + A pageful of 64-bit sptes containing the translations for this page. + Accessed by both kvm and hardware. + The page pointed to by spt will have its page->private pointing back + at the shadow page structure. + sptes in spt point either at guest pages, or at lower-level shadow pages. + Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point + at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. + The spt array forms a DAG structure with the shadow page as a node, and + guest pages as leaves. + gfns: + An array of 512 guest frame numbers, one for each present pte. Used to + perform a reverse map from a pte to a gfn. When role.direct is set, any + element of this array can be calculated from the gfn field when used, in + this case, the array of gfns is not allocated. See role.direct and gfn. + root_count: + A counter keeping track of how many hardware registers (guest cr3 or + pdptrs) are now pointing at the page. While this counter is nonzero, the + page cannot be destroyed. See role.invalid. + parent_ptes: + The reverse mapping for the pte/ptes pointing at this page's spt. If + parent_ptes bit 0 is zero, only one spte points at this page and + parent_ptes points at this single spte, otherwise, there exists multiple + sptes pointing at this page and (parent_ptes & ~0x1) points at a data + structure with a list of parent sptes. + unsync: + If true, then the translations in this page may not match the guest's + translation. This is equivalent to the state of the tlb when a pte is + changed but before the tlb entry is flushed. Accordingly, unsync ptes + are synchronized when the guest executes invlpg or flushes its tlb by + other means. Valid for leaf pages. + unsync_children: + How many sptes in the page point at pages that are unsync (or have + unsynchronized children). + unsync_child_bitmap: + A bitmap indicating which sptes in spt point (directly or indirectly) at + pages that may be unsynchronized. Used to quickly locate all unsychronized + pages reachable from a given page. + clear_spte_count: + Only present on 32-bit hosts, where a 64-bit spte cannot be written + atomically. The reader uses this while running out of the MMU lock + to detect in-progress updates and retry them until the writer has + finished the write. + write_flooding_count: + A guest may write to a page table many times, causing a lot of + emulations if the page needs to be write-protected (see "Synchronized + and unsynchronized pages" below). Leaf pages can be unsynchronized + so that they do not trigger frequent emulation, but this is not + possible for non-leafs. This field counts the number of emulations + since the last time the page table was actually used; if emulation + is triggered too frequently on this page, KVM will unmap the page + to avoid emulation in the future. + +Reverse map +=========== + +The mmu maintains a reverse mapping whereby all ptes mapping a page can be +reached given its gfn. This is used, for example, when swapping out a page. + +Synchronized and unsynchronized pages +===================================== + +The guest uses two events to synchronize its tlb and page tables: tlb flushes +and page invalidations (invlpg). + +A tlb flush means that we need to synchronize all sptes reachable from the +guest's cr3. This is expensive, so we keep all guest page tables write +protected, and synchronize sptes to gptes when a gpte is written. + +A special case is when a guest page table is reachable from the current +guest cr3. In this case, the guest is obliged to issue an invlpg instruction +before using the translation. We take advantage of that by removing write +protection from the guest page, and allowing the guest to modify it freely. +We synchronize modified gptes when the guest invokes invlpg. This reduces +the amount of emulation we have to do when the guest modifies multiple gptes, +or when the a guest page is no longer used as a page table and is used for +random guest data. + +As a side effect we have to resynchronize all reachable unsynchronized shadow +pages on a tlb flush. + + +Reaction to events +================== + +- guest page fault (or npt page fault, or ept violation) + +This is the most complicated event. The cause of a page fault can be: + + - a true guest fault (the guest translation won't allow the access) (*) + - access to a missing translation + - access to a protected translation + - when logging dirty pages, memory is write protected + - synchronized shadow pages are write protected (*) + - access to untranslatable memory (mmio) + + (*) not applicable in direct mode + +Handling a page fault is performed as follows: + + - if the RSV bit of the error code is set, the page fault is caused by guest + accessing MMIO and cached MMIO information is available. + - walk shadow page table + - check for valid generation number in the spte (see "Fast invalidation of + MMIO sptes" below) + - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and + vcpu->arch.mmio_gfn, and call the emulator + - If both P bit and R/W bit of error code are set, this could possibly + be handled as a "fast page fault" (fixed without taking the MMU lock). See + the description in Documentation/virt/kvm/locking.txt. + - if needed, walk the guest page tables to determine the guest translation + (gva->gpa or ngpa->gpa) + - if permissions are insufficient, reflect the fault back to the guest + - determine the host page + - if this is an mmio request, there is no host page; cache the info to + vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn + - walk the shadow page table to find the spte for the translation, + instantiating missing intermediate page tables as necessary + - If this is an mmio request, cache the mmio info to the spte and set some + reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask) + - try to unsynchronize the page + - if successful, we can let the guest continue and modify the gpte + - emulate the instruction + - if failed, unshadow the page and let the guest continue + - update any translations that were modified by the instruction + +invlpg handling: + + - walk the shadow page hierarchy and drop affected translations + - try to reinstantiate the indicated translation in the hope that the + guest will use it in the near future + +Guest control register updates: + +- mov to cr3 + - look up new shadow roots + - synchronize newly reachable shadow pages + +- mov to cr0/cr4/efer + - set up mmu context for new paging mode + - look up new shadow roots + - synchronize newly reachable shadow pages + +Host translation updates: + + - mmu notifier called with updated hva + - look up affected sptes through reverse map + - drop (or update) translations + +Emulating cr0.wp +================ + +If tdp is not enabled, the host must keep cr0.wp=1 so page write protection +works for the guest kernel, not guest guest userspace. When the guest +cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, +we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the +semantics require allowing any guest kernel access plus user read access). + +We handle this by mapping the permissions to two possible sptes, depending +on fault type: + +- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, + disallows user access) +- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel + write access) + +(user write faults generate a #PF) + +In the first case there are two additional complications: +- if CR4.SMEP is enabled: since we've turned the page into a kernel page, + the kernel may now execute it. We handle this by also setting spte.nx. + If we get a user fetch or read fault, we'll change spte.u=1 and + spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when + shadow paging is in use. +- if CR4.SMAP is disabled: since the page has been changed to a kernel + page, it can not be reused when CR4.SMAP is enabled. We set + CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note, + here we do not care the case that CR4.SMAP is enabled since KVM will + directly inject #PF to guest due to failed permission check. + +To prevent an spte that was converted into a kernel page with cr0.wp=0 +from being written by the kernel after cr0.wp has changed to 1, we make +the value of cr0.wp part of the page role. This means that an spte created +with one value of cr0.wp cannot be used when cr0.wp has a different value - +it will simply be missed by the shadow page lookup code. A similar issue +exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after +changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep +is also made a part of the page role. + +Large pages +=========== + +The mmu supports all combinations of large and small guest and host pages. +Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as +two separate 2M pages, on both guest and host, since the mmu always uses PAE +paging. + +To instantiate a large spte, four constraints must be satisfied: + +- the spte must point to a large host page +- the guest pte must be a large pte of at least equivalent size (if tdp is + enabled, there is no guest pte and this condition is satisfied) +- if the spte will be writeable, the large page frame may not overlap any + write-protected pages +- the guest page must be wholly contained by a single memory slot + +To check the last two conditions, the mmu maintains a ->disallow_lpage set of +arrays for each memory slot and large page size. Every write protected page +causes its disallow_lpage to be incremented, thus preventing instantiation of +a large spte. The frames at the end of an unaligned memory slot have +artificially inflated ->disallow_lpages so they can never be instantiated. + +Fast invalidation of MMIO sptes +=============================== + +As mentioned in "Reaction to events" above, kvm will cache MMIO +information in leaf sptes. When a new memslot is added or an existing +memslot is changed, this information may become stale and needs to be +invalidated. This also needs to hold the MMU lock while walking all +shadow pages, and is made more scalable with a similar technique. + +MMIO sptes have a few spare bits, which are used to store a +generation number. The global generation number is stored in +kvm_memslots(kvm)->generation, and increased whenever guest memory info +changes. + +When KVM finds an MMIO spte, it checks the generation number of the spte. +If the generation number of the spte does not equal the global generation +number, it will ignore the cached MMIO information and handle the page +fault through the slow path. + +Since only 19 bits are used to store generation-number on mmio spte, all +pages are zapped when there is an overflow. + +Unfortunately, a single memory access might access kvm_memslots(kvm) multiple +times, the last one happening when the generation number is retrieved and +stored into the MMIO spte. Thus, the MMIO spte might be created based on +out-of-date information, but with an up-to-date generation number. + +To avoid this, the generation number is incremented again after synchronize_srcu +returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a +memslot update, while some SRCU readers might be using the old copy. We do not +want to use an MMIO sptes created with an odd generation number, and we can do +this without losing a bit in the MMIO spte. The "update in-progress" bit of the +generation is not stored in MMIO spte, and is so is implicitly zero when the +generation is extracted out of the spte. If KVM is unlucky and creates an MMIO +spte while an update is in-progress, the next access to the spte will always be +a cache miss. For example, a subsequent access during the update window will +miss due to the in-progress flag diverging, while an access after the update +window closes will have a higher generation number (as compared to the spte). + + +Further reading +=============== + +- NPT presentation from KVM Forum 2008 + http://www.linux-kvm.org/images/c/c8/KvmForum2008%24kdf2008_21.pdf + diff --git a/Documentation/virt/kvm/msr.txt b/Documentation/virt/kvm/msr.txt new file mode 100644 index 000000000000..df1f4338b3ca --- /dev/null +++ b/Documentation/virt/kvm/msr.txt @@ -0,0 +1,284 @@ +KVM-specific MSRs. +Glauber Costa , Red Hat Inc, 2010 +===================================================== + +KVM makes use of some custom MSRs to service some requests. + +Custom MSRs have a range reserved for them, that goes from +0x4b564d00 to 0x4b564dff. There are MSRs outside this area, +but they are deprecated and their use is discouraged. + +Custom MSR list +-------- + +The current supported Custom MSR list is: + +MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 + + data: 4-byte alignment physical address of a memory area which must be + in guest RAM. This memory is expected to hold a copy of the following + structure: + + struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; + } __attribute__((__packed__)); + + whose data will be filled in by the hypervisor. The hypervisor is only + guaranteed to update this data at the moment of MSR write. + Users that want to reliably query this information more than once have + to write more than once to this MSR. Fields have the following meanings: + + version: guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + sec: number of seconds for wallclock at time of boot. + + nsec: number of nanoseconds for wallclock at time of boot. + + In order to get the current wallclock time, the system_time from + MSR_KVM_SYSTEM_TIME_NEW needs to be added. + + Note that although MSRs are per-CPU entities, the effect of this + particular MSR is global. + + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 + + data: 4-byte aligned physical address of a memory area which must be in + guest RAM, plus an enable bit in bit 0. This memory is expected to hold + a copy of the following structure: + + struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; + } __attribute__((__packed__)); /* 32 bytes */ + + whose data will be filled in by the hypervisor periodically. Only one + write, or registration, is needed for each VCPU. The interval between + updates of this structure is arbitrary and implementation-dependent. + The hypervisor may update this structure at any time it sees fit until + anything with bit0 == 0 is written to it. + + Fields have the following meanings: + + version: guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + tsc_timestamp: the tsc value at the current VCPU at the time + of the update of this structure. Guests can subtract this value + from current tsc to derive a notion of elapsed time since the + structure update. + + system_time: a host notion of monotonic time, including sleep + time at the time this structure was last updated. Unit is + nanoseconds. + + tsc_to_system_mul: multiplier to be used when converting + tsc-related quantity to nanoseconds + + tsc_shift: shift to be used when converting tsc-related + quantity to nanoseconds. This shift will ensure that + multiplication with tsc_to_system_mul does not overflow. + A positive value denotes a left shift, a negative value + a right shift. + + The conversion from tsc to nanoseconds involves an additional + right shift by 32 bits. With this information, guests can + derive per-CPU time by doing: + + time = (current_tsc - tsc_timestamp) + if (tsc_shift >= 0) + time <<= tsc_shift; + else + time >>= -tsc_shift; + time = (time * tsc_to_system_mul) >> 32 + time = time + system_time + + flags: bits in this field indicate extended capabilities + coordinated between the guest and the hypervisor. Availability + of specific flags has to be checked in 0x40000001 cpuid leaf. + Current flags are: + + flag bit | cpuid bit | meaning + ------------------------------------------------------------- + | | time measures taken across + 0 | 24 | multiple cpus are guaranteed to + | | be monotonic + ------------------------------------------------------------- + | | guest vcpu has been paused by + 1 | N/A | the host + | | See 4.70 in api.txt + ------------------------------------------------------------- + + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid + leaf prior to usage. + + +MSR_KVM_WALL_CLOCK: 0x11 + + data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME: 0x12 + + data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid + leaf prior to usage. + + The suggested algorithm for detecting kvmclock presence is then: + + if (!kvm_para_available()) /* refer to cpuid.txt */ + return NON_PRESENT; + + flags = cpuid_eax(0x40000001); + if (flags & 3) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + return PRESENT; + } else if (flags & 0) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + return PRESENT; + } else + return NON_PRESENT; + +MSR_KVM_ASYNC_PF_EN: 0x4b564d02 + data: Bits 63-6 hold 64-byte aligned physical address of a + 64 byte memory area which must be in guest RAM and must be + zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 + when asynchronous page faults are enabled on the vcpu 0 when + disabled. Bit 1 is 1 if asynchronous page faults can be injected + when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults + are delivered to L1 as #PF vmexits. Bit 2 can be set only if + KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. + + First 4 byte of 64 byte memory location will be written to by + the hypervisor at the time of asynchronous page fault (APF) + injection to indicate type of asynchronous page fault. Value + of 1 means that the page referred to by the page fault is not + present. Value 2 means that the page is now available. Disabling + interrupt inhibits APFs. Guest must not enable interrupt + before the reason is read, or it may be overwritten by another + APF. Since APF uses the same exception vector as regular page + fault guest must reset the reason to 0 before it does + something that can generate normal page fault. If during page + fault APF reason is 0 it means that this is regular page + fault. + + During delivery of type 1 APF cr2 contains a token that will + be used to notify a guest when missing page becomes + available. When page becomes available type 2 APF is sent with + cr2 set to the token associated with the page. There is special + kind of token 0xffffffff which tells vcpu that it should wake + up all processes waiting for APFs and no individual type 2 APFs + will be sent. + + If APF is disabled while there are outstanding APFs, they will + not be delivered. + + Currently type 2 APF will be always delivered on the same vcpu as + type 1 was, but guest should not rely on that. + +MSR_KVM_STEAL_TIME: 0x4b564d03 + + data: 64-byte alignment physical address of a memory area which must be + in guest RAM, plus an enable bit in bit 0. This memory is expected to + hold a copy of the following structure: + + struct kvm_steal_time { + __u64 steal; + __u32 version; + __u32 flags; + __u8 preempted; + __u8 u8_pad[3]; + __u32 pad[11]; + } + + whose data will be filled in by the hypervisor periodically. Only one + write, or registration, is needed for each VCPU. The interval between + updates of this structure is arbitrary and implementation-dependent. + The hypervisor may update this structure at any time it sees fit until + anything with bit0 == 0 is written to it. Guest is required to make sure + this structure is initialized to zero. + + Fields have the following meanings: + + version: a sequence counter. In other words, guest has to check + this field before and after grabbing time information and make + sure they are both equal and even. An odd version indicates an + in-progress update. + + flags: At this point, always zero. May be used to indicate + changes in this structure in the future. + + steal: the amount of time in which this vCPU did not run, in + nanoseconds. Time during which the vcpu is idle, will not be + reported as steal time. + + preempted: indicate the vCPU who owns this struct is running or + not. Non-zero values mean the vCPU has been preempted. Zero + means the vCPU is not preempted. NOTE, it is always zero if the + the hypervisor doesn't support this field. + +MSR_KVM_EOI_EN: 0x4b564d04 + data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 + when disabled. Bit 1 is reserved and must be zero. When PV end of + interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned + physical address of a 4 byte memory area which must be in guest RAM and + must be zeroed. + + The first, least significant bit of 4 byte memory location will be + written to by the hypervisor, typically at the time of interrupt + injection. Value of 1 means that guest can skip writing EOI to the apic + (using MSR or MMIO write); instead, it is sufficient to signal + EOI by clearing the bit in guest memory - this location will + later be polled by the hypervisor. + Value of 0 means that the EOI write is required. + + It is always safe for the guest to ignore the optimization and perform + the APIC EOI write anyway. + + Hypervisor is guaranteed to only modify this least + significant bit while in the current VCPU context, this means that + guest does not need to use either lock prefix or memory ordering + primitives to synchronise with the hypervisor. + + However, hypervisor can set and clear this memory bit at any time: + therefore to make sure hypervisor does not interrupt the + guest and clear the least significant bit in the memory area + in the window between guest testing it to detect + whether it can skip EOI apic write and between guest + clearing it to signal EOI to the hypervisor, + guest must both read the least significant bit in the memory area and + clear it using a single CPU instruction, such as test and clear, or + compare and exchange. + +MSR_KVM_POLL_CONTROL: 0x4b564d05 + Control host-side polling. + + data: Bit 0 enables (1) or disables (0) host-side HLT polling logic. + + KVM guests can request the host not to poll on HLT, for example if + they are performing polling themselves. + diff --git a/Documentation/virt/kvm/nested-vmx.txt b/Documentation/virt/kvm/nested-vmx.txt new file mode 100644 index 000000000000..97eb1353e962 --- /dev/null +++ b/Documentation/virt/kvm/nested-vmx.txt @@ -0,0 +1,240 @@ +Nested VMX +========== + +Overview +--------- + +On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions) +to easily and efficiently run guest operating systems. Normally, these guests +*cannot* themselves be hypervisors running their own guests, because in VMX, +guests cannot use VMX instructions. + +The "Nested VMX" feature adds this missing capability - of running guest +hypervisors (which use VMX) with their own nested guests. It does so by +allowing a guest to use VMX instructions, and correctly and efficiently +emulating them using the single level of VMX available in the hardware. + +We describe in much greater detail the theory behind the nested VMX feature, +its implementation and its performance characteristics, in the OSDI 2010 paper +"The Turtles Project: Design and Implementation of Nested Virtualization", +available at: + + http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf + + +Terminology +----------- + +Single-level virtualization has two levels - the host (KVM) and the guests. +In nested virtualization, we have three levels: The host (KVM), which we call +L0, the guest hypervisor, which we call L1, and its nested guest, which we +call L2. + + +Running nested VMX +------------------ + +The nested VMX feature is disabled by default. It can be enabled by giving +the "nested=1" option to the kvm-intel module. + +No modifications are required to user space (qemu). However, qemu's default +emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be +explicitly enabled, by giving qemu one of the following options: + + -cpu host (emulated CPU has all features of the real CPU) + + -cpu qemu64,+vmx (add just the vmx feature to a named CPU type) + + +ABIs +---- + +Nested VMX aims to present a standard and (eventually) fully-functional VMX +implementation for the a guest hypervisor to use. As such, the official +specification of the ABI that it provides is Intel's VMX specification, +namely volume 3B of their "Intel 64 and IA-32 Architectures Software +Developer's Manual". Not all of VMX's features are currently fully supported, +but the goal is to eventually support them all, starting with the VMX features +which are used in practice by popular hypervisors (KVM and others). + +As a VMX implementation, nested VMX presents a VMCS structure to L1. +As mandated by the spec, other than the two fields revision_id and abort, +this structure is *opaque* to its user, who is not supposed to know or care +about its internal structure. Rather, the structure is accessed through the +VMREAD and VMWRITE instructions. +Still, for debugging purposes, KVM developers might be interested to know the +internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c. + +The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we +also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS +which L0 builds to actually run L2 - how this is done is explained in the +aforementioned paper. + +For convenience, we repeat the content of struct vmcs12 here. If the internals +of this structure changes, this can break live migration across KVM versions. +VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner +struct shadow_vmcs is ever changed. + + typedef u64 natural_width; + struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with + * these two user-visible fields */ + u32 revision_id; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 ept_pointer; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 padding64[8]; /* room for future expansion */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 padding32[8]; /* room for future expansion */ + u16 virtual_processor_id; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + }; + + +Authors +------- + +These patches were written by: + Abel Gordon, abelg il.ibm.com + Nadav Har'El, nyh il.ibm.com + Orit Wasserman, oritw il.ibm.com + Ben-Ami Yassor, benami il.ibm.com + Muli Ben-Yehuda, muli il.ibm.com + +With contributions by: + Anthony Liguori, aliguori us.ibm.com + Mike Day, mdday us.ibm.com + Michael Factor, factor il.ibm.com + Zvi Dubitzky, dubi il.ibm.com + +And valuable reviews by: + Avi Kivity, avi redhat.com + Gleb Natapov, gleb redhat.com + Marcelo Tosatti, mtosatti redhat.com + Kevin Tian, kevin.tian intel.com + and others. diff --git a/Documentation/virt/kvm/ppc-pv.txt b/Documentation/virt/kvm/ppc-pv.txt new file mode 100644 index 000000000000..e26115ce4258 --- /dev/null +++ b/Documentation/virt/kvm/ppc-pv.txt @@ -0,0 +1,212 @@ +The PPC KVM paravirtual interface +================================= + +The basic execution principle by which KVM on PowerPC works is to run all kernel +space code in PR=1 which is user space. This way we trap all privileged +instructions and can emulate them accordingly. + +Unfortunately that is also the downfall. There are quite some privileged +instructions that needlessly return us to the hypervisor even though they +could be handled differently. + +This is what the PPC PV interface helps with. It takes privileged instructions +and transforms them into unprivileged ones with some help from the hypervisor. +This cuts down virtualization costs by about 50% on some of my benchmarks. + +The code for that interface can be found in arch/powerpc/kernel/kvm* + +Querying for existence +====================== + +To find out if we're running on KVM or not, we leverage the device tree. When +Linux is running on KVM, a node /hypervisor exists. That node contains a +compatible property with the value "linux,kvm". + +Once you determined you're running under a PV capable KVM, you can now use +hypercalls as described below. + +KVM hypercalls +============== + +Inside the device tree's /hypervisor node there's a property called +'hypercall-instructions'. This property contains at most 4 opcodes that make +up the hypercall. To call a hypercall, just call these instructions. + +The parameters are as follows: + + Register IN OUT + + r0 - volatile + r3 1st parameter Return code + r4 2nd parameter 1st output value + r5 3rd parameter 2nd output value + r6 4th parameter 3rd output value + r7 5th parameter 4th output value + r8 6th parameter 5th output value + r9 7th parameter 6th output value + r10 8th parameter 7th output value + r11 hypercall number 8th output value + r12 - volatile + +Hypercall definitions are shared in generic code, so the same hypercall numbers +apply for x86 and powerpc alike with the exception that each KVM hypercall +also needs to be ORed with the KVM vendor code which is (42 << 16). + +Return codes can be as follows: + + Code Meaning + + 0 Success + 12 Hypercall not implemented + <0 Error + +The magic page +============== + +To enable communication between the hypervisor and guest there is a new shared +page that contains parts of supervisor visible register state. The guest can +map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. + +With this hypercall issued the guest always gets the magic page mapped at the +desired location. The first parameter indicates the effective address when the +MMU is enabled. The second parameter indicates the address in real mode, if +applicable to the target. For now, we always map the page to -4096. This way we +can access it using absolute load and store functions. The following +instruction reads the first field of the magic page: + + ld rX, -4096(0) + +The interface is designed to be extensible should there be need later to add +additional registers to the magic page. If you add fields to the magic page, +also define a new hypercall feature to indicate that the host can give you more +registers. Only if the host supports the additional features, make use of them. + +The magic page layout is described by struct kvm_vcpu_arch_shared +in arch/powerpc/include/asm/kvm_para.h. + +Magic page features +=================== + +When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, +a second return value is passed to the guest. This second return value contains +a bitmap of available features inside the magic page. + +The following enhancements to the magic page are currently available: + + KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page + KVM_MAGIC_FEAT_MAS0_TO_SPRG7 Maps MASn, ESR, PIR and high SPRGs + +For enhanced features in the magic page, please check for the existence of the +feature before using them! + +Magic page flags +================ + +In addition to features that indicate whether a host is capable of a particular +feature we also have a channel for a guest to tell the guest whether it's capable +of something. This is what we call "flags". + +Flags are passed to the host in the low 12 bits of the Effective Address. + +The following flags are currently available for a guest to expose: + + MAGIC_PAGE_FLAG_NOT_MAPPED_NX Guest handles NX bits correctly wrt magic page + +MSR bits +======== + +The MSR contains bits that require hypervisor intervention and bits that do +not require direct hypervisor intervention because they only get interpreted +when entering the guest or don't have any impact on the hypervisor's behavior. + +The following bits are safe to be set inside the guest: + + MSR_EE + MSR_RI + +If any other bit changes in the MSR, please still use mtmsr(d). + +Patched instructions +==================== + +The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions +respectively on 32 bit systems with an added offset of 4 to accommodate for big +endianness. + +The following is a list of mapping the Linux kernel performs when running as +guest. Implementing any of those mappings is optional, as the instruction traps +also act on the shared page. So calling privileged instructions still works as +before. + +From To +==== == + +mfmsr rX ld rX, magic_page->msr +mfsprg rX, 0 ld rX, magic_page->sprg0 +mfsprg rX, 1 ld rX, magic_page->sprg1 +mfsprg rX, 2 ld rX, magic_page->sprg2 +mfsprg rX, 3 ld rX, magic_page->sprg3 +mfsrr0 rX ld rX, magic_page->srr0 +mfsrr1 rX ld rX, magic_page->srr1 +mfdar rX ld rX, magic_page->dar +mfdsisr rX lwz rX, magic_page->dsisr + +mtmsr rX std rX, magic_page->msr +mtsprg 0, rX std rX, magic_page->sprg0 +mtsprg 1, rX std rX, magic_page->sprg1 +mtsprg 2, rX std rX, magic_page->sprg2 +mtsprg 3, rX std rX, magic_page->sprg3 +mtsrr0 rX std rX, magic_page->srr0 +mtsrr1 rX std rX, magic_page->srr1 +mtdar rX std rX, magic_page->dar +mtdsisr rX stw rX, magic_page->dsisr + +tlbsync nop + +mtmsrd rX, 0 b +mtmsr rX b + +mtmsrd rX, 1 b + +[Book3S only] +mtsrin rX, rY b + +[BookE only] +wrteei [0|1] b + + +Some instructions require more logic to determine what's going on than a load +or store instruction can deliver. To enable patching of those, we keep some +RAM around where we can live translate instructions to. What happens is the +following: + + 1) copy emulation code to memory + 2) patch that code to fit the emulated instruction + 3) patch that code to return to the original pc + 4 + 4) patch the original instruction to branch to the new code + +That way we can inject an arbitrary amount of code as replacement for a single +instruction. This allows us to check for pending interrupts when setting EE=1 +for example. + +Hypercall ABIs in KVM on PowerPC +================================= +1) KVM hypercalls (ePAPR) + +These are ePAPR compliant hypercall implementation (mentioned above). Even +generic hypercalls are implemented here, like the ePAPR idle hcall. These are +available on all targets. + +2) PAPR hypercalls + +PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU). +These are the same hypercalls that pHyp, the POWER hypervisor implements. Some of +them are handled in the kernel, some are handled in user space. This is only +available on book3s_64. + +3) OSI hypercalls + +Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long +before KVM). This is supported to maintain compatibility. All these hypercalls get +forwarded to user space. This is only useful on book3s_32, but can be used with +book3s_64 as well. diff --git a/Documentation/virt/kvm/review-checklist.txt b/Documentation/virt/kvm/review-checklist.txt new file mode 100644 index 000000000000..499af499e296 --- /dev/null +++ b/Documentation/virt/kvm/review-checklist.txt @@ -0,0 +1,38 @@ +Review checklist for kvm patches +================================ + +1. The patch must follow Documentation/process/coding-style.rst and + Documentation/process/submitting-patches.rst. + +2. Patches should be against kvm.git master branch. + +3. If the patch introduces or modifies a new userspace API: + - the API must be documented in Documentation/virt/kvm/api.txt + - the API must be discoverable using KVM_CHECK_EXTENSION + +4. New state must include support for save/restore. + +5. New features must default to off (userspace should explicitly request them). + Performance improvements can and should default to on. + +6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 + +7. Emulator changes should be accompanied by unit tests for qemu-kvm.git + kvm/test directory. + +8. Changes should be vendor neutral when possible. Changes to common code + are better than duplicating changes to vendor code. + +9. Similarly, prefer changes to arch independent code than to arch dependent + code. + +10. User/kernel interfaces and guest/host interfaces must be 64-bit clean + (all variables and sizes naturally aligned on 64-bit; use specific types + only - u64 rather than ulong). + +11. New guest visible features must either be documented in a hardware manual + or be accompanied by documentation. + +12. Features must be robust against reset and kexec - for example, shared + host/guest memory must be unshared to prevent the host from writing to + guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/virt/kvm/s390-diag.txt b/Documentation/virt/kvm/s390-diag.txt new file mode 100644 index 000000000000..7c52e5f8b210 --- /dev/null +++ b/Documentation/virt/kvm/s390-diag.txt @@ -0,0 +1,83 @@ +The s390 DIAGNOSE call on KVM +============================= + +KVM on s390 supports the DIAGNOSE call for making hypercalls, both for +native hypercalls and for selected hypercalls found on other s390 +hypervisors. + +Note that bits are numbered as by the usual s390 convention (most significant +bit on the left). + + +General remarks +--------------- + +DIAGNOSE calls by the guest cause a mandatory intercept. This implies +all supported DIAGNOSE calls need to be handled by either KVM or its +userspace. + +All DIAGNOSE calls supported by KVM use the RS-a format: + +-------------------------------------- +| '83' | R1 | R3 | B2 | D2 | +-------------------------------------- +0 8 12 16 20 31 + +The second-operand address (obtained by the base/displacement calculation) +is not used to address data. Instead, bits 48-63 of this address specify +the function code, and bits 0-47 are ignored. + +The supported DIAGNOSE function codes vary by the userspace used. For +DIAGNOSE function codes not specific to KVM, please refer to the +documentation for the s390 hypervisors defining them. + + +DIAGNOSE function code 'X'500' - KVM virtio functions +----------------------------------------------------- + +If the function code specifies 0x500, various virtio-related functions +are performed. + +General register 1 contains the virtio subfunction code. Supported +virtio subfunctions depend on KVM's userspace. Generally, userspace +provides either s390-virtio (subcodes 0-2) or virtio-ccw (subcode 3). + +Upon completion of the DIAGNOSE instruction, general register 2 contains +the function's return code, which is either a return code or a subcode +specific value. + +Subcode 0 - s390-virtio notification and early console printk + Handled by userspace. + +Subcode 1 - s390-virtio reset + Handled by userspace. + +Subcode 2 - s390-virtio set status + Handled by userspace. + +Subcode 3 - virtio-ccw notification + Handled by either userspace or KVM (ioeventfd case). + + General register 2 contains a subchannel-identification word denoting + the subchannel of the virtio-ccw proxy device to be notified. + + General register 3 contains the number of the virtqueue to be notified. + + General register 4 contains a 64bit identifier for KVM usage (the + kvm_io_bus cookie). If general register 4 does not contain a valid + identifier, it is ignored. + + After completion of the DIAGNOSE call, general register 2 may contain + a 64bit identifier (in the kvm_io_bus cookie case), or a negative + error value, if an internal error occurred. + + See also the virtio standard for a discussion of this hypercall. + + +DIAGNOSE function code 'X'501 - KVM breakpoint +---------------------------------------------- + +If the function code specifies 0x501, breakpoint functions may be performed. +This function code is handled by userspace. + +This diagnose function code has no subfunctions and uses no parameters. diff --git a/Documentation/virt/kvm/timekeeping.txt b/Documentation/virt/kvm/timekeeping.txt new file mode 100644 index 000000000000..76808a17ad84 --- /dev/null +++ b/Documentation/virt/kvm/timekeeping.txt @@ -0,0 +1,612 @@ + + Timekeeping Virtualization for X86-Based Architectures + + Zachary Amsden + Copyright (c) 2010, Red Hat. All rights reserved. + +1) Overview +2) Timing Devices +3) TSC Hardware +4) Virtualization Problems + +========================================================================= + +1) Overview + +One of the most complicated parts of the X86 platform, and specifically, +the virtualization of this platform is the plethora of timing devices available +and the complexity of emulating those devices. In addition, virtualization of +time introduces a new set of challenges because it introduces a multiplexed +division of time beyond the control of the guest CPU. + +First, we will describe the various timekeeping hardware available, then +present some of the problems which arise and solutions available, giving +specific recommendations for certain classes of KVM guests. + +The purpose of this document is to collect data and information relevant to +timekeeping which may be difficult to find elsewhere, specifically, +information relevant to KVM and hardware-based virtualization. + +========================================================================= + +2) Timing Devices + +First we discuss the basic hardware devices available. TSC and the related +KVM clock are special enough to warrant a full exposition and are described in +the following section. + +2.1) i8254 - PIT + +One of the first timer devices available is the programmable interrupt timer, +or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three +channels which can be programmed to deliver periodic or one-shot interrupts. +These three channels can be configured in different modes and have individual +counters. Channel 1 and 2 were not available for general use in the original +IBM PC, and historically were connected to control RAM refresh and the PC +speaker. Now the PIT is typically integrated as part of an emulated chipset +and a separate physical PIT is not used. + +The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done +using single or multiple byte access to the I/O ports. There are 6 modes +available, but not all modes are available to all timers, as only timer 2 +has a connected gate input, required for modes 1 and 5. The gate line is +controlled by port 61h, bit 0, as illustrated in the following diagram. + + -------------- ---------------- +| | | | +| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 +| Clock | | | | + -------------- | +->| GATE TIMER 0 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM + | | | (aka /dev/null) + | +->| GATE TIMER 1 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> Port 61h, bit 5 + | | | +Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ + ---------------- _| )--|LPF|---Speaker + / *---- \___/ +Port 61h, bit 1 -----------------------------------/ + +The timer modes are now described. + +Mode 0: Single Timeout. This is a one-shot software timeout that counts down + when the gate is high (always true for timers 0 and 1). When the count + reaches zero, the output goes high. + +Mode 1: Triggered One-shot. The output is initially set high. When the gate + line is set high, a countdown is initiated (which does not stop if the gate is + lowered), during which the output is set low. When the count reaches zero, + the output goes high. + +Mode 2: Rate Generator. The output is initially set high. When the countdown + reaches 1, the output goes low for one count and then returns high. The value + is reloaded and the countdown automatically resumes. If the gate line goes + low, the count is halted. If the output is low when the gate is lowered, the + output automatically goes high (this only affects timer 2). + +Mode 3: Square Wave. This generates a high / low square wave. The count + determines the length of the pulse, which alternates between high and low + when zero is reached. The count only proceeds when gate is high and is + automatically reloaded on reaching zero. The count is decremented twice at + each clock to generate a full high / low cycle at the full periodic rate. + If the count is even, the clock remains high for N/2 counts and low for N/2 + counts; if the clock is odd, the clock is high for (N+1)/2 counts and low + for (N-1)/2 counts. Only even values are latched by the counter, so odd + values are not observed when reading. This is the intended mode for timer 2, + which generates sine-like tones by low-pass filtering the square wave output. + +Mode 4: Software Strobe. After programming this mode and loading the counter, + the output remains high until the counter reaches zero. Then the output + goes low for 1 clock cycle and returns high. The counter is not reloaded. + Counting only occurs when gate is high. + +Mode 5: Hardware Strobe. After programming and loading the counter, the + output remains high. When the gate is raised, a countdown is initiated + (which does not stop if the gate is lowered). When the counter reaches zero, + the output goes low for 1 clock cycle and then returns high. The counter is + not reloaded. + +In addition to normal binary counting, the PIT supports BCD counting. The +command port, 0x43 is used to set the counter and mode for each of the three +timers. + +PIT commands, issued to port 0x43, using the following bit encoding: + +Bit 7-4: Command (See table below) +Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) +Bit 0 : Binary (0) / BCD (1) + +Command table: + +0000 - Latch Timer 0 count for port 0x40 + sample and hold the count to be read in port 0x40; + additional commands ignored until counter is read; + mode bits ignored. + +0001 - Set Timer 0 LSB mode for port 0x40 + set timer to read LSB only and force MSB to zero; + mode bits set timer mode + +0010 - Set Timer 0 MSB mode for port 0x40 + set timer to read MSB only and force LSB to zero; + mode bits set timer mode + +0011 - Set Timer 0 16-bit mode for port 0x40 + set timer to read / write LSB first, then MSB; + mode bits set timer mode + +0100 - Latch Timer 1 count for port 0x41 - as described above +0101 - Set Timer 1 LSB mode for port 0x41 - as described above +0110 - Set Timer 1 MSB mode for port 0x41 - as described above +0111 - Set Timer 1 16-bit mode for port 0x41 - as described above + +1000 - Latch Timer 2 count for port 0x42 - as described above +1001 - Set Timer 2 LSB mode for port 0x42 - as described above +1010 - Set Timer 2 MSB mode for port 0x42 - as described above +1011 - Set Timer 2 16-bit mode for port 0x42 as described above + +1101 - General counter latch + Latch combination of counters into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + Bit 0 = Unused + +1110 - Latch timer status + Latch combination of counter mode into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + + The output of ports 0x40-0x42 following this command will be: + + Bit 7 = Output pin + Bit 6 = Count loaded (0 if timer has expired) + Bit 5-4 = Read / Write mode + 01 = MSB only + 10 = LSB only + 11 = LSB / MSB (16-bit) + Bit 3-1 = Mode + Bit 0 = Binary (0) / BCD mode (1) + +2.2) RTC + +The second device which was available in the original PC was the MC146818 real +time clock. The original device is now obsolete, and usually emulated by the +system chipset, sometimes by an HPET and some frankenstein IRQ routing. + +The RTC is accessed through CMOS variables, which uses an index register to +control which bytes are read. Since there is only one index register, read +of the CMOS and read of the RTC require lock protection (in addition, it is +dangerous to allow userspace utilities such as hwclock to have direct RTC +access, as they could corrupt kernel reads and writes of CMOS memory). + +The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt +can function as a periodic timer, an additional once a day alarm, and can issue +interrupts after an update of the CMOS registers by the MC146818 is complete. +The type of interrupt is signalled in the RTC status registers. + +The RTC will update the current time fields by battery power even while the +system is off. The current time fields should not be read while an update is +in progress, as indicated in the status register. + +The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be +programmed to a 32kHz divider if the RTC is to count seconds. + +This is the RAM map originally used for the RTC/CMOS: + +Location Size Description +------------------------------------------ +00h byte Current second (BCD) +01h byte Seconds alarm (BCD) +02h byte Current minute (BCD) +03h byte Minutes alarm (BCD) +04h byte Current hour (BCD) +05h byte Hours alarm (BCD) +06h byte Current day of week (BCD) +07h byte Current day of month (BCD) +08h byte Current month (BCD) +09h byte Current year (BCD) +0Ah byte Register A + bit 7 = Update in progress + bit 6-4 = Divider for clock + 000 = 4.194 MHz + 001 = 1.049 MHz + 010 = 32 kHz + 10X = test modes + 110 = reset / disable + 111 = reset / disable + bit 3-0 = Rate selection for periodic interrupt + 000 = periodic timer disabled + 001 = 3.90625 uS + 010 = 7.8125 uS + 011 = .122070 mS + 100 = .244141 mS + ... + 1101 = 125 mS + 1110 = 250 mS + 1111 = 500 mS +0Bh byte Register B + bit 7 = Run (0) / Halt (1) + bit 6 = Periodic interrupt enable + bit 5 = Alarm interrupt enable + bit 4 = Update-ended interrupt enable + bit 3 = Square wave interrupt enable + bit 2 = BCD calendar (0) / Binary (1) + bit 1 = 12-hour mode (0) / 24-hour mode (1) + bit 0 = 0 (DST off) / 1 (DST enabled) +OCh byte Register C (read only) + bit 7 = interrupt request flag (IRQF) + bit 6 = periodic interrupt flag (PF) + bit 5 = alarm interrupt flag (AF) + bit 4 = update interrupt flag (UF) + bit 3-0 = reserved +ODh byte Register D (read only) + bit 7 = RTC has power + bit 6-0 = reserved +32h byte Current century BCD (*) + (*) location vendor specific and now determined from ACPI global tables + +2.3) APIC + +On Pentium and later processors, an on-board timer is available to each CPU +as part of the Advanced Programmable Interrupt Controller. The APIC is +accessed through memory-mapped registers and provides interrupt service to each +CPU, used for IPIs and local timer interrupts. + +Although in theory the APIC is a safe and stable source for local interrupts, +in practice, many bugs and glitches have occurred due to the special nature of +the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect +the use of the APIC and that workarounds may be required. In addition, some of +these workarounds pose unique constraints for virtualization - requiring either +extra overhead incurred from extra reads of memory-mapped I/O or additional +functionality that may be more computationally expensive to implement. + +Since the APIC is documented quite well in the Intel and AMD manuals, we will +avoid repetition of the detail here. It should be pointed out that the APIC +timer is programmed through the LVT (local vector timer) register, is capable +of one-shot or periodic operation, and is based on the bus clock divided down +by the programmable divider register. + +2.4) HPET + +HPET is quite complex, and was originally intended to replace the PIT / RTC +support of the X86 PC. It remains to be seen whether that will be the case, as +the de facto standard of PC hardware is to emulate these older devices. Some +systems designated as legacy free may support only the HPET as a hardware timer +device. + +The HPET spec is rather loose and vague, requiring at least 3 hardware timers, +but allowing implementation freedom to support many more. It also imposes no +fixed rate on the timer frequency, but does impose some extremal values on +frequency, error and slew. + +In general, the HPET is recommended as a high precision (compared to PIT /RTC) +time source which is independent of local variation (as there is only one HPET +in any given system). The HPET is also memory-mapped, and its presence is +indicated through ACPI tables by the BIOS. + +Detailed specification of the HPET is beyond the current scope of this +document, as it is also very well documented elsewhere. + +2.5) Offboard Timers + +Several cards, both proprietary (watchdog boards) and commonplace (e1000) have +timing chips built into the cards which may have registers which are accessible +to kernel or user drivers. To the author's knowledge, using these to generate +a clocksource for a Linux or other kernel has not yet been attempted and is in +general frowned upon as not playing by the agreed rules of the game. Such a +timer device would require additional support to be virtualized properly and is +not considered important at this time as no known operating system does this. + +========================================================================= + +3) TSC Hardware + +The TSC or time stamp counter is relatively simple in theory; it counts +instruction cycles issued by the processor, which can be used as a measure of +time. In practice, due to a number of problems, it is the most complicated +timekeeping device to use. + +The TSC is represented internally as a 64-bit MSR which can be read with the +RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware +limitations made it possible to write the TSC, but generally on old hardware it +was only possible to write the low 32-bits of the 64-bit counter, and the upper +32-bits of the counter were cleared. Now, however, on Intel processors family +0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction +has been lifted and all 64-bits are writable. On AMD systems, the ability to +write the TSC MSR is not an architectural guarantee. + +The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by +means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. + +Some vendors have implemented an additional instruction, RDTSCP, which returns +atomically not just the TSC, but an indicator which corresponds to the +processor number. This can be used to index into an array of TSC variables to +determine offset information in SMP systems where TSCs are not synchronized. +The presence of this instruction must be determined by consulting CPUID feature +bits. + +Both VMX and SVM provide extension fields in the virtualization hardware which +allows the guest visible TSC to be offset by a constant. Newer implementations +promise to allow the TSC to additionally be scaled, but this hardware is not +yet widely available. + +3.1) TSC synchronization + +The TSC is a CPU-local clock in most implementations. This means, on SMP +platforms, the TSCs of different CPUs may start at different times depending +on when the CPUs are powered on. Generally, CPUs on the same die will share +the same clock, however, this is not always the case. + +The BIOS may attempt to resynchronize the TSCs during the poweron process and +the operating system or other system software may attempt to do this as well. +Several hardware limitations make the problem worse - if it is not possible to +write the full 64-bits of the TSC, it may be impossible to match the TSC in +newly arriving CPUs to that of the rest of the system, resulting in +unsynchronized TSCs. This may be done by BIOS or system software, but in +practice, getting a perfectly synchronized TSC will not be possible unless all +values are read from the same clock, which generally only is possible on single +socket systems or those with special hardware support. + +3.2) TSC and CPU hotplug + +As touched on already, CPUs which arrive later than the boot time of the system +may not have a TSC value that is synchronized with the rest of the system. +Either system software, BIOS, or SMM code may actually try to establish the TSC +to a value matching the rest of the system, but a perfect match is usually not +a guarantee. This can have the effect of bringing a system from a state where +TSC is synchronized back to a state where TSC synchronization flaws, however +small, may be exposed to the OS and any virtualization environment. + +3.3) TSC and multi-socket / NUMA + +Multi-socket systems, especially large multi-socket systems are likely to have +individual clocksources rather than a single, universally distributed clock. +Since these clocks are driven by different crystals, they will not have +perfectly matched frequency, and temperature and electrical variations will +cause the CPU clocks, and thus the TSCs to drift over time. Depending on the +exact clock and bus design, the drift may or may not be fixed in absolute +error, and may accumulate over time. + +In addition, very large systems may deliberately slew the clocks of individual +cores. This technique, known as spread-spectrum clocking, reduces EMI at the +clock frequency and harmonics of it, which may be required to pass FCC +standards for telecommunications and computer equipment. + +It is recommended not to trust the TSCs to remain synchronized on NUMA or +multiple socket systems for these reasons. + +3.4) TSC and C-states + +C-states, or idling states of the processor, especially C1E and deeper sleep +states may be problematic for TSC as well. The TSC may stop advancing in such +a state, resulting in a TSC which is behind that of other CPUs when execution +is resumed. Such CPUs must be detected and flagged by the operating system +based on CPU and chipset identifications. + +The TSC in such a case may be corrected by catching it up to a known external +clocksource. + +3.5) TSC frequency change / P-states + +To make things slightly more interesting, some CPUs may change frequency. They +may or may not run the TSC at the same rate, and because the frequency change +may be staggered or slewed, at some points in time, the TSC rate may not be +known other than falling within a range of values. In this case, the TSC will +not be a stable time source, and must be calibrated against a known, stable, +external clock to be a usable source of time. + +Whether the TSC runs at a constant rate or scales with the P-state is model +dependent and must be determined by inspecting CPUID, chipset or vendor +specific MSR fields. + +In addition, some vendors have known bugs where the P-state is actually +compensated for properly during normal operation, but when the processor is +inactive, the P-state may be raised temporarily to service cache misses from +other processors. In such cases, the TSC on halted CPUs could advance faster +than that of non-halted processors. AMD Turion processors are known to have +this problem. + +3.6) TSC and STPCLK / T-states + +External signals given to the processor may also have the effect of stopping +the TSC. This is typically done for thermal emergency power control to prevent +an overheating condition, and typically, there is no way to detect that this +condition has happened. + +3.7) TSC virtualization - VMX + +VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET +field specified in the VMCS. Special instructions must be used to read and +write the VMCS field. + +3.8) TSC virtualization - SVM + +SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, SVM allows passing through the host TSC plus an additional offset +field specified in the SVM control block. + +3.9) TSC feature bits in Linux + +In summary, there is no way to guarantee the TSC remains in perfect +synchronization unless it is explicitly guaranteed by the architecture. Even +if so, the TSCs in multi-sockets or NUMA systems may still run independently +despite being locally consistent. + +The following feature bits are used by Linux to signal various TSC attributes, +but they can only be taken to be meaningful for UP or single node systems. + +X86_FEATURE_TSC : The TSC is available in hardware +X86_FEATURE_RDTSCP : The RDTSCP instruction is available +X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states +X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states +X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) + +4) Virtualization Problems + +Timekeeping is especially problematic for virtualization because a number of +challenges arise. The most obvious problem is that time is now shared between +the host and, potentially, a number of virtual machines. Thus the virtual +operating system does not run with 100% usage of the CPU, despite the fact that +it may very well make that assumption. It may expect it to remain true to very +exacting bounds when interrupt sources are disabled, but in reality only its +virtual interrupt sources are disabled, and the machine may still be preempted +at any time. This causes problems as the passage of real time, the injection +of machine interrupts and the associated clock sources are no longer completely +synchronized with real time. + +This same problem can occur on native hardware to a degree, as SMM mode may +steal cycles from the naturally on X86 systems when SMM mode is used by the +BIOS, but not in such an extreme fashion. However, the fact that SMM mode may +cause similar problems to virtualization makes it a good justification for +solving many of these problems on bare metal. + +4.1) Interrupt clocking + +One of the most immediate problems that occurs with legacy operating systems +is that the system timekeeping routines are often designed to keep track of +time by counting periodic interrupts. These interrupts may come from the PIT +or the RTC, but the problem is the same: the host virtualization engine may not +be able to deliver the proper number of interrupts per second, and so guest +time may fall behind. This is especially problematic if a high interrupt rate +is selected, such as 1000 HZ, which is unfortunately the default for many Linux +guests. + +There are three approaches to solving this problem; first, it may be possible +to simply ignore it. Guests which have a separate time source for tracking +'wall clock' or 'real time' may not need any adjustment of their interrupts to +maintain proper time. If this is not sufficient, it may be necessary to inject +additional interrupts into the guest in order to increase the effective +interrupt rate. This approach leads to complications in extreme conditions, +where host load or guest lag is too much to compensate for, and thus another +solution to the problem has risen: the guest may need to become aware of lost +ticks and compensate for them internally. Although promising in theory, the +implementation of this policy in Linux has been extremely error prone, and a +number of buggy variants of lost tick compensation are distributed across +commonly used Linux systems. + +Windows uses periodic RTC clocking as a means of keeping time internally, and +thus requires interrupt slewing to keep proper time. It does use a low enough +rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in +practice. + +4.2) TSC sampling and serialization + +As the highest precision time source available, the cycle counter of the CPU +has aroused much interest from developers. As explained above, this timer has +many problems unique to its nature as a local, potentially unstable and +potentially unsynchronized source. One issue which is not unique to the TSC, +but is highlighted because of its very precise nature is sampling delay. By +definition, the counter, once read is already old. However, it is also +possible for the counter to be read ahead of the actual use of the result. +This is a consequence of the superscalar execution of the instruction stream, +which may execute instructions out of order. Such execution is called +non-serialized. Forcing serialized execution is necessary for precise +measurement with the TSC, and requires a serializing instruction, such as CPUID +or an MSR read. + +Since CPUID may actually be virtualized by a trap and emulate mechanism, this +serialization can pose a performance issue for hardware virtualization. An +accurate time stamp counter reading may therefore not always be available, and +it may be necessary for an implementation to guard against "backwards" reads of +the TSC as seen from other CPUs, even in an otherwise perfectly synchronized +system. + +4.3) Timespec aliasing + +Additionally, this lack of serialization from the TSC poses another challenge +when using results of the TSC when measured against another time source. As +the TSC is much higher precision, many possible values of the TSC may be read +while another clock is still expressing the same value. + +That is, you may read (T,T+10) while external clock C maintains the same value. +Due to non-serialized reads, you may actually end up with a range which +fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but +calibrated against an external value may have a range of valid values. +Re-calibrating this computation may actually cause time, as computed after the +calibration, to go backwards, compared with time computed before the +calibration. + +This problem is particularly pronounced with an internal time source in Linux, +the kernel time, which is expressed in the theoretically high resolution +timespec - but which advances in much larger granularity intervals, sometimes +at the rate of jiffies, and possibly in catchup modes, at a much larger step. + +This aliasing requires care in the computation and recalibration of kvmclock +and any other values derived from TSC computation (such as TSC virtualization +itself). + +4.4) Migration + +Migration of a virtual machine raises problems for timekeeping in two ways. +First, the migration itself may take time, during which interrupts cannot be +delivered, and after which, the guest time may need to be caught up. NTP may +be able to help to some degree here, as the clock correction required is +typically small enough to fall in the NTP-correctable window. + +An additional concern is that timers based off the TSC (or HPET, if the raw bus +clock is exposed) may now be running at different rates, requiring compensation +in some way in the hypervisor by virtualizing these timers. In addition, +migrating to a faster machine may preclude the use of a passthrough TSC, as a +faster clock cannot be made visible to a guest without the potential of time +advancing faster than usual. A slower clock is less of a problem, as it can +always be caught up to the original rate. KVM clock avoids these problems by +simply storing multipliers and offsets against the TSC for the guest to convert +back into nanosecond resolution values. + +4.5) Scheduling + +Since scheduling may be based on precise timing and firing of interrupts, the +scheduling algorithms of an operating system may be adversely affected by +virtualization. In theory, the effect is random and should be universally +distributed, but in contrived as well as real scenarios (guest device access, +causes of virtualization exits, possible context switch), this may not always +be the case. The effect of this has not been well studied. + +In an attempt to work around this, several implementations have provided a +paravirtualized scheduler clock, which reveals the true amount of CPU time for +which a virtual machine has been running. + +4.6) Watchdogs + +Watchdog timers, such as the lock detector in Linux may fire accidentally when +running under hardware virtualization due to timer interrupts being delayed or +misinterpretation of the passage of real time. Usually, these warnings are +spurious and can be ignored, but in some circumstances it may be necessary to +disable such detection. + +4.7) Delays and precision timing + +Precise timing and delays may not be possible in a virtualized system. This +can happen if the system is controlling physical hardware, or issues delays to +compensate for slower I/O to and from devices. The first issue is not solvable +in general for a virtualized system; hardware control software can't be +adequately virtualized without a full real-time operating system, which would +require an RT aware virtualization platform. + +The second issue may cause performance problems, but this is unlikely to be a +significant issue. In many cases these delays may be eliminated through +configuration or paravirtualization. + +4.8) Covert channels and leaks + +In addition to the above problems, time information will inevitably leak to the +guest about the host in anything but a perfect implementation of virtualized +time. This may allow the guest to infer the presence of a hypervisor (as in a +red-pill type detection), and it may allow information to leak between guests +by using CPU utilization itself as a signalling channel. Preventing such +problems would require completely isolated virtual time which may not track +real time any longer. This may be useful in certain security or QA contexts, +but in general isn't recommended for real-world deployment scenarios. diff --git a/Documentation/virt/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst new file mode 100644 index 000000000000..5feb3706a7ae --- /dev/null +++ b/Documentation/virt/kvm/vcpu-requests.rst @@ -0,0 +1,307 @@ +================= +KVM VCPU Requests +================= + +Overview +======== + +KVM supports an internal API enabling threads to request a VCPU thread to +perform some activity. For example, a thread may request a VCPU to flush +its TLB with a VCPU request. The API consists of the following functions:: + + /* Check if any requests are pending for VCPU @vcpu. */ + bool kvm_request_pending(struct kvm_vcpu *vcpu); + + /* Check if VCPU @vcpu has request @req pending. */ + bool kvm_test_request(int req, struct kvm_vcpu *vcpu); + + /* Clear request @req for VCPU @vcpu. */ + void kvm_clear_request(int req, struct kvm_vcpu *vcpu); + + /* + * Check if VCPU @vcpu has request @req pending. When the request is + * pending it will be cleared and a memory barrier, which pairs with + * another in kvm_make_request(), will be issued. + */ + bool kvm_check_request(int req, struct kvm_vcpu *vcpu); + + /* + * Make request @req of VCPU @vcpu. Issues a memory barrier, which pairs + * with another in kvm_check_request(), prior to setting the request. + */ + void kvm_make_request(int req, struct kvm_vcpu *vcpu); + + /* Make request @req of all VCPUs of the VM with struct kvm @kvm. */ + bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); + +Typically a requester wants the VCPU to perform the activity as soon +as possible after making the request. This means most requests +(kvm_make_request() calls) are followed by a call to kvm_vcpu_kick(), +and kvm_make_all_cpus_request() has the kicking of all VCPUs built +into it. + +VCPU Kicks +---------- + +The goal of a VCPU kick is to bring a VCPU thread out of guest mode in +order to perform some KVM maintenance. To do so, an IPI is sent, forcing +a guest mode exit. However, a VCPU thread may not be in guest mode at the +time of the kick. Therefore, depending on the mode and state of the VCPU +thread, there are two other actions a kick may take. All three actions +are listed below: + +1) Send an IPI. This forces a guest mode exit. +2) Waking a sleeping VCPU. Sleeping VCPUs are VCPU threads outside guest + mode that wait on waitqueues. Waking them removes the threads from + the waitqueues, allowing the threads to run again. This behavior + may be suppressed, see KVM_REQUEST_NO_WAKEUP below. +3) Nothing. When the VCPU is not in guest mode and the VCPU thread is not + sleeping, then there is nothing to do. + +VCPU Mode +--------- + +VCPUs have a mode state, ``vcpu->mode``, that is used to track whether the +guest is running in guest mode or not, as well as some specific +outside guest mode states. The architecture may use ``vcpu->mode`` to +ensure VCPU requests are seen by VCPUs (see "Ensuring Requests Are Seen"), +as well as to avoid sending unnecessary IPIs (see "IPI Reduction"), and +even to ensure IPI acknowledgements are waited upon (see "Waiting for +Acknowledgements"). The following modes are defined: + +OUTSIDE_GUEST_MODE + + The VCPU thread is outside guest mode. + +IN_GUEST_MODE + + The VCPU thread is in guest mode. + +EXITING_GUEST_MODE + + The VCPU thread is transitioning from IN_GUEST_MODE to + OUTSIDE_GUEST_MODE. + +READING_SHADOW_PAGE_TABLES + + The VCPU thread is outside guest mode, but it wants the sender of + certain VCPU requests, namely KVM_REQ_TLB_FLUSH, to wait until the VCPU + thread is done reading the page tables. + +VCPU Request Internals +====================== + +VCPU requests are simply bit indices of the ``vcpu->requests`` bitmap. +This means general bitops, like those documented in [atomic-ops]_ could +also be used, e.g. :: + + clear_bit(KVM_REQ_UNHALT & KVM_REQUEST_MASK, &vcpu->requests); + +However, VCPU request users should refrain from doing so, as it would +break the abstraction. The first 8 bits are reserved for architecture +independent requests, all additional bits are available for architecture +dependent requests. + +Architecture Independent Requests +--------------------------------- + +KVM_REQ_TLB_FLUSH + + KVM's common MMU notifier may need to flush all of a guest's TLB + entries, calling kvm_flush_remote_tlbs() to do so. Architectures that + choose to use the common kvm_flush_remote_tlbs() implementation will + need to handle this VCPU request. + +KVM_REQ_MMU_RELOAD + + When shadow page tables are used and memory slots are removed it's + necessary to inform each VCPU to completely refresh the tables. This + request is used for that. + +KVM_REQ_PENDING_TIMER + + This request may be made from a timer handler run on the host on behalf + of a VCPU. It informs the VCPU thread to inject a timer interrupt. + +KVM_REQ_UNHALT + + This request may be made from the KVM common function kvm_vcpu_block(), + which is used to emulate an instruction that causes a CPU to halt until + one of an architectural specific set of events and/or interrupts is + received (determined by checking kvm_arch_vcpu_runnable()). When that + event or interrupt arrives kvm_vcpu_block() makes the request. This is + in contrast to when kvm_vcpu_block() returns due to any other reason, + such as a pending signal, which does not indicate the VCPU's halt + emulation should stop, and therefore does not make the request. + +KVM_REQUEST_MASK +---------------- + +VCPU requests should be masked by KVM_REQUEST_MASK before using them with +bitops. This is because only the lower 8 bits are used to represent the +request's number. The upper bits are used as flags. Currently only two +flags are defined. + +VCPU Request Flags +------------------ + +KVM_REQUEST_NO_WAKEUP + + This flag is applied to requests that only need immediate attention + from VCPUs running in guest mode. That is, sleeping VCPUs do not need + to be awaken for these requests. Sleeping VCPUs will handle the + requests when they are awaken later for some other reason. + +KVM_REQUEST_WAIT + + When requests with this flag are made with kvm_make_all_cpus_request(), + then the caller will wait for each VCPU to acknowledge its IPI before + proceeding. This flag only applies to VCPUs that would receive IPIs. + If, for example, the VCPU is sleeping, so no IPI is necessary, then + the requesting thread does not wait. This means that this flag may be + safely combined with KVM_REQUEST_NO_WAKEUP. See "Waiting for + Acknowledgements" for more information about requests with + KVM_REQUEST_WAIT. + +VCPU Requests with Associated State +=================================== + +Requesters that want the receiving VCPU to handle new state need to ensure +the newly written state is observable to the receiving VCPU thread's CPU +by the time it observes the request. This means a write memory barrier +must be inserted after writing the new state and before setting the VCPU +request bit. Additionally, on the receiving VCPU thread's side, a +corresponding read barrier must be inserted after reading the request bit +and before proceeding to read the new state associated with it. See +scenario 3, Message and Flag, of [lwn-mb]_ and the kernel documentation +[memory-barriers]_. + +The pair of functions, kvm_check_request() and kvm_make_request(), provide +the memory barriers, allowing this requirement to be handled internally by +the API. + +Ensuring Requests Are Seen +========================== + +When making requests to VCPUs, we want to avoid the receiving VCPU +executing in guest mode for an arbitrary long time without handling the +request. We can be sure this won't happen as long as we ensure the VCPU +thread checks kvm_request_pending() before entering guest mode and that a +kick will send an IPI to force an exit from guest mode when necessary. +Extra care must be taken to cover the period after the VCPU thread's last +kvm_request_pending() check and before it has entered guest mode, as kick +IPIs will only trigger guest mode exits for VCPU threads that are in guest +mode or at least have already disabled interrupts in order to prepare to +enter guest mode. This means that an optimized implementation (see "IPI +Reduction") must be certain when it's safe to not send the IPI. One +solution, which all architectures except s390 apply, is to: + +- set ``vcpu->mode`` to IN_GUEST_MODE between disabling the interrupts and + the last kvm_request_pending() check; +- enable interrupts atomically when entering the guest. + +This solution also requires memory barriers to be placed carefully in both +the requesting thread and the receiving VCPU. With the memory barriers we +can exclude the possibility of a VCPU thread observing +!kvm_request_pending() on its last check and then not receiving an IPI for +the next request made of it, even if the request is made immediately after +the check. This is done by way of the Dekker memory barrier pattern +(scenario 10 of [lwn-mb]_). As the Dekker pattern requires two variables, +this solution pairs ``vcpu->mode`` with ``vcpu->requests``. Substituting +them into the pattern gives:: + + CPU1 CPU2 + ================= ================= + local_irq_disable(); + WRITE_ONCE(vcpu->mode, IN_GUEST_MODE); kvm_make_request(REQ, vcpu); + smp_mb(); smp_mb(); + if (kvm_request_pending(vcpu)) { if (READ_ONCE(vcpu->mode) == + IN_GUEST_MODE) { + ...abort guest entry... ...send IPI... + } } + +As stated above, the IPI is only useful for VCPU threads in guest mode or +that have already disabled interrupts. This is why this specific case of +the Dekker pattern has been extended to disable interrupts before setting +``vcpu->mode`` to IN_GUEST_MODE. WRITE_ONCE() and READ_ONCE() are used to +pedantically implement the memory barrier pattern, guaranteeing the +compiler doesn't interfere with ``vcpu->mode``'s carefully planned +accesses. + +IPI Reduction +------------- + +As only one IPI is needed to get a VCPU to check for any/all requests, +then they may be coalesced. This is easily done by having the first IPI +sending kick also change the VCPU mode to something !IN_GUEST_MODE. The +transitional state, EXITING_GUEST_MODE, is used for this purpose. + +Waiting for Acknowledgements +---------------------------- + +Some requests, those with the KVM_REQUEST_WAIT flag set, require IPIs to +be sent, and the acknowledgements to be waited upon, even when the target +VCPU threads are in modes other than IN_GUEST_MODE. For example, one case +is when a target VCPU thread is in READING_SHADOW_PAGE_TABLES mode, which +is set after disabling interrupts. To support these cases, the +KVM_REQUEST_WAIT flag changes the condition for sending an IPI from +checking that the VCPU is IN_GUEST_MODE to checking that it is not +OUTSIDE_GUEST_MODE. + +Request-less VCPU Kicks +----------------------- + +As the determination of whether or not to send an IPI depends on the +two-variable Dekker memory barrier pattern, then it's clear that +request-less VCPU kicks are almost never correct. Without the assurance +that a non-IPI generating kick will still result in an action by the +receiving VCPU, as the final kvm_request_pending() check does for +request-accompanying kicks, then the kick may not do anything useful at +all. If, for instance, a request-less kick was made to a VCPU that was +just about to set its mode to IN_GUEST_MODE, meaning no IPI is sent, then +the VCPU thread may continue its entry without actually having done +whatever it was the kick was meant to initiate. + +One exception is x86's posted interrupt mechanism. In this case, however, +even the request-less VCPU kick is coupled with the same +local_irq_disable() + smp_mb() pattern described above; the ON bit +(Outstanding Notification) in the posted interrupt descriptor takes the +role of ``vcpu->requests``. When sending a posted interrupt, PIR.ON is +set before reading ``vcpu->mode``; dually, in the VCPU thread, +vmx_sync_pir_to_irr() reads PIR after setting ``vcpu->mode`` to +IN_GUEST_MODE. + +Additional Considerations +========================= + +Sleeping VCPUs +-------------- + +VCPU threads may need to consider requests before and/or after calling +functions that may put them to sleep, e.g. kvm_vcpu_block(). Whether they +do or not, and, if they do, which requests need consideration, is +architecture dependent. kvm_vcpu_block() calls kvm_arch_vcpu_runnable() +to check if it should awaken. One reason to do so is to provide +architectures a function where requests may be checked if necessary. + +Clearing Requests +----------------- + +Generally it only makes sense for the receiving VCPU thread to clear a +request. However, in some circumstances, such as when the requesting +thread and the receiving VCPU thread are executed serially, such as when +they are the same thread, or when they are using some form of concurrency +control to temporarily execute synchronously, then it's possible to know +that the request may be cleared immediately, rather than waiting for the +receiving VCPU thread to handle the request in VCPU RUN. The only current +examples of this are kvm_vcpu_block() calls made by VCPUs to block +themselves. A possible side-effect of that call is to make the +KVM_REQ_UNHALT request, which may then be cleared immediately when the +VCPU returns from the call. + +References +========== + +.. [atomic-ops] Documentation/core-api/atomic_ops.rst +.. [memory-barriers] Documentation/memory-barriers.txt +.. [lwn-mb] https://lwn.net/Articles/573436/ diff --git a/Documentation/virt/paravirt_ops.rst b/Documentation/virt/paravirt_ops.rst new file mode 100644 index 000000000000..6b789d27cead --- /dev/null +++ b/Documentation/virt/paravirt_ops.rst @@ -0,0 +1,35 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Paravirt_ops +============ + +Linux provides support for different hypervisor virtualization technologies. +Historically different binary kernels would be required in order to support +different hypervisors, this restriction was removed with pv_ops. +Linux pv_ops is a virtualization API which enables support for different +hypervisors. It allows each hypervisor to override critical operations and +allows a single kernel binary to run on all supported execution environments +including native machine -- without any hypervisors. + +pv_ops provides a set of function pointers which represent operations +corresponding to low level critical instructions and high level +functionalities in various areas. pv-ops allows for optimizations at run +time by enabling binary patching of the low-ops critical operations +at boot time. + +pv_ops operations are classified into three categories: + +- simple indirect call + These operations correspond to high level functionality where it is + known that the overhead of indirect call isn't very important. + +- indirect call which allows optimization with binary patch + Usually these operations correspond to low level critical instructions. They + are called frequently and are performance critical. The overhead is + very important. + +- a set of macros for hand written assembly code + Hand written assembly codes (.S files) also need paravirtualization + because they include sensitive instructions or some of code paths in + them are very performance critical. diff --git a/Documentation/virt/uml/UserModeLinux-HOWTO.txt b/Documentation/virt/uml/UserModeLinux-HOWTO.txt new file mode 100644 index 000000000000..87b80f589e1c --- /dev/null +++ b/Documentation/virt/uml/UserModeLinux-HOWTO.txt @@ -0,0 +1,4589 @@ + User Mode Linux HOWTO + User Mode Linux Core Team + Mon Nov 18 14:16:16 EST 2002 + + This document describes the use and abuse of Jeff Dike's User Mode + Linux: a port of the Linux kernel as a normal Intel Linux process. + ______________________________________________________________________ + + Table of Contents + + 1. Introduction + + 1.1 How is User Mode Linux Different? + 1.2 Why Would I Want User Mode Linux? + + 2. Compiling the kernel and modules + + 2.1 Compiling the kernel + 2.2 Compiling and installing kernel modules + 2.3 Compiling and installing uml_utilities + + 3. Running UML and logging in + + 3.1 Running UML + 3.2 Logging in + 3.3 Examples + + 4. UML on 2G/2G hosts + + 4.1 Introduction + 4.2 The problem + 4.3 The solution + + 5. Setting up serial lines and consoles + + 5.1 Specifying the device + 5.2 Specifying the channel + 5.3 Examples + + 6. Setting up the network + + 6.1 General setup + 6.2 Userspace daemons + 6.3 Specifying ethernet addresses + 6.4 UML interface setup + 6.5 Multicast + 6.6 TUN/TAP with the uml_net helper + 6.7 TUN/TAP with a preconfigured tap device + 6.8 Ethertap + 6.9 The switch daemon + 6.10 Slip + 6.11 Slirp + 6.12 pcap + 6.13 Setting up the host yourself + + 7. Sharing Filesystems between Virtual Machines + + 7.1 A warning + 7.2 Using layered block devices + 7.3 Note! + 7.4 Another warning + 7.5 uml_moo : Merging a COW file with its backing file + + 8. Creating filesystems + + 8.1 Create the filesystem file + 8.2 Assign the file to a UML device + 8.3 Creating and mounting the filesystem + + 9. Host file access + + 9.1 Using hostfs + 9.2 hostfs as the root filesystem + 9.3 Building hostfs + + 10. The Management Console + 10.1 version + 10.2 halt and reboot + 10.3 config + 10.4 remove + 10.5 sysrq + 10.6 help + 10.7 cad + 10.8 stop + 10.9 go + + 11. Kernel debugging + + 11.1 Starting the kernel under gdb + 11.2 Examining sleeping processes + 11.3 Running ddd on UML + 11.4 Debugging modules + 11.5 Attaching gdb to the kernel + 11.6 Using alternate debuggers + + 12. Kernel debugging examples + + 12.1 The case of the hung fsck + 12.2 Episode 2: The case of the hung fsck + + 13. What to do when UML doesn't work + + 13.1 Strange compilation errors when you build from source + 13.2 (obsolete) + 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem + 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' + 13.5 UML doesn't work when /tmp is an NFS filesystem + 13.6 UML hangs on boot when compiled with gprof support + 13.7 syslogd dies with a SIGTERM on startup + 13.8 TUN/TAP networking doesn't work on a 2.4 host + 13.9 You can network to the host but not to other machines on the net + 13.10 I have no root and I want to scream + 13.11 UML build conflict between ptrace.h and ucontext.h + 13.12 The UML BogoMips is exactly half the host's BogoMips + 13.13 When you run UML, it immediately segfaults + 13.14 xterms appear, then immediately disappear + 13.15 Any other panic, hang, or strange behavior + + 14. Diagnosing Problems + + 14.1 Case 1 : Normal kernel panics + 14.2 Case 2 : Tracing thread panics + 14.3 Case 3 : Tracing thread panics caused by other threads + 14.4 Case 4 : Hangs + + 15. Thanks + + 15.1 Code and Documentation + 15.2 Flushing out bugs + 15.3 Buglets and clean-ups + 15.4 Case Studies + 15.5 Other contributions + + + ______________________________________________________________________ + + 1. Introduction + + Welcome to User Mode Linux. It's going to be fun. + + + + 1.1. How is User Mode Linux Different? + + Normally, the Linux Kernel talks straight to your hardware (video + card, keyboard, hard drives, etc), and any programs which run ask the + kernel to operate the hardware, like so: + + + + +-----------+-----------+----+ + | Process 1 | Process 2 | ...| + +-----------+-----------+----+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + The User Mode Linux Kernel is different; instead of talking to the + hardware, it talks to a `real' Linux kernel (called the `host kernel' + from now on), like any other program. Programs can then run inside + User-Mode Linux as if they were running under a normal kernel, like + so: + + + + +----------------+ + | Process 2 | ...| + +-----------+----------------+ + | Process 1 | User-Mode Linux| + +----------------------------+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + + 1.2. Why Would I Want User Mode Linux? + + + 1. If User Mode Linux crashes, your host kernel is still fine. + + 2. You can run a usermode kernel as a non-root user. + + 3. You can debug the User Mode Linux like any normal process. + + 4. You can run gprof (profiling) and gcov (coverage testing). + + 5. You can play with your kernel without breaking things. + + 6. You can use it as a sandbox for testing new apps. + + 7. You can try new development kernels safely. + + 8. You can run different distributions simultaneously. + + 9. It's extremely fun. + + + + + + 2. Compiling the kernel and modules + + + + + 2.1. Compiling the kernel + + + Compiling the user mode kernel is just like compiling any other + kernel. Let's go through the steps, using 2.4.0-prerelease (current + as of this writing) as an example: + + + 1. Download the latest UML patch from + + the download page + . + + + 3. Make a directory and unpack the kernel into it. + + + + host% + mkdir ~/uml + + + + + + + host% + cd ~/uml + + + + + + + host% + tar -xzvf linux-2.4.0-prerelease.tar.bz2 + + + + + + + 4. Apply the patch using + + + + host% + cd ~/uml/linux + + + + host% + bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 + + + + + + + 5. Run your favorite config; `make xconfig ARCH=um' is the most + convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' + will work as well. The defaults will give you a useful kernel. If + you want to change something, go ahead, it probably won't hurt + anything. + + + Note: If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries + will not run. They will immediately segfault. See ``UML on 2G/2G + hosts'' for the scoop on running UML on your system. + + + + 6. Finish with `make linux ARCH=um': the result is a file called + `linux' in the top directory of your source tree. + + Make sure that you don't build this kernel in /usr/src/linux. On some + distributions, /usr/include/asm is a link into this pool. The user- + mode build changes the other end of that link, and things that include + stop compiling. + + The sources are also available from cvs at the project's cvs page, + which has directions on getting the sources. You can also browse the + CVS pool from there. + + If you get the CVS sources, you will have to check them out into an + empty directory. You will then have to copy each file into the + corresponding directory in the appropriate kernel pool. + + If you don't have the latest kernel pool, you can get the + corresponding user-mode sources with + + + host% cvs co -r v_2_3_x linux + + + + + where 'x' is the version in your pool. Note that you will not get the + bug fixes and enhancements that have gone into subsequent releases. + + + 2.2. Compiling and installing kernel modules + + UML modules are built in the same way as the native kernel (with the + exception of the 'ARCH=um' that you always need for UML): + + + host% make modules ARCH=um + + + + + Any modules that you want to load into this kernel need to be built in + the user-mode pool. Modules from the native kernel won't work. + + You can install them by using ftp or something to copy them into the + virtual machine and dropping them into /lib/modules/`uname -r`. + + You can also get the kernel build process to install them as follows: + + 1. with the kernel not booted, mount the root filesystem in the top + level of the kernel pool: + + + host% mount root_fs mnt -o loop + + + + + + + 2. run + + + host% + make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um + + + + + + + 3. unmount the filesystem + + + host% umount mnt + + + + + + + 4. boot the kernel on it + + + When the system is booted, you can use insmod as usual to get the + modules into the kernel. A number of things have been loaded into UML + as modules, especially filesystems and network protocols and filters, + so most symbols which need to be exported probably already are. + However, if you do find symbols that need exporting, let us + know, and + they'll be "taken care of". + + + + 2.3. Compiling and installing uml_utilities + + Many features of the UML kernel require a user-space helper program, + so a uml_utilities package is distributed separately from the kernel + patch which provides these helpers. Included within this is: + + o port-helper - Used by consoles which connect to xterms or ports + + o tunctl - Configuration tool to create and delete tap devices + + o uml_net - Setuid binary for automatic tap device configuration + + o uml_switch - User-space virtual switch required for daemon + transport + + The uml_utilities tree is compiled with: + + + host# + make && make install + + + + + Note that UML kernel patches may require a specific version of the + uml_utilities distribution. If you don't keep up with the mailing + lists, ensure that you have the latest release of uml_utilities if you + are experiencing problems with your UML kernel, particularly when + dealing with consoles or command-line switches to the helper programs + + + + + + + + + 3. Running UML and logging in + + + + 3.1. Running UML + + It runs on 2.2.15 or later, and all 2.4 kernels. + + + Booting UML is straightforward. Simply run 'linux': it will try to + mount the file `root_fs' in the current directory. You do not need to + run it as root. If your root filesystem is not named `root_fs', then + you need to put a `ubd0=root_fs_whatever' switch on the linux command + line. + + + You will need a filesystem to boot UML from. There are a number + available for download from here . There are also several tools + which can be + used to generate UML-compatible filesystem images from media. + The kernel will boot up and present you with a login prompt. + + + Note: If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries will + not run. They will immediately segfault. See ``UML on 2G/2G hosts'' + for the scoop on running UML on your system. + + + + 3.2. Logging in + + + + The prepackaged filesystems have a root account with password 'root' + and a user account with password 'user'. The login banner will + generally tell you how to log in. So, you log in and you will find + yourself inside a little virtual machine. Our filesystems have a + variety of commands and utilities installed (and it is fairly easy to + add more), so you will have a lot of tools with which to poke around + the system. + + There are a couple of other ways to log in: + + o On a virtual console + + + + Each virtual console that is configured (i.e. the device exists in + /dev and /etc/inittab runs a getty on it) will come up in its own + xterm. If you get tired of the xterms, read ``Setting up serial + lines and consoles'' to see how to attach the consoles to + something else, like host ptys. + + + + o Over the serial line + + + In the boot output, find a line that looks like: + + + + serial line 0 assigned pty /dev/ptyp1 + + + + + Attach your favorite terminal program to the corresponding tty. I.e. + for minicom, the command would be + + + host% minicom -o -p /dev/ttyp1 + + + + + + + o Over the net + + + If the network is running, then you can telnet to the virtual + machine and log in to it. See ``Setting up the network'' to learn + about setting up a virtual network. + + When you're done using it, run halt, and the kernel will bring itself + down and the process will exit. + + + 3.3. Examples + + Here are some examples of UML in action: + + o A login session + + o A virtual network + + + + + + + + 4. UML on 2G/2G hosts + + + + + 4.1. Introduction + + + Most Linux machines are configured so that the kernel occupies the + upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and + processes use the lower 3G (0x00000000 - 0xbfffffff). However, some + machine are configured with a 2G/2G split, with the kernel occupying + the upper 2G (0x80000000 - 0xffffffff) and processes using the lower + 2G (0x00000000 - 0x7fffffff). + + + + + 4.2. The problem + + + The prebuilt UML binaries on this site will not run on 2G/2G hosts + because UML occupies the upper .5G of the 3G process address space + (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right + in the middle of the kernel address space, so UML won't even load - it + will immediately segfault. + + + + + 4.3. The solution + + + The fix for this is to rebuild UML from source after enabling + CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to + load itself in the top .5G of that smaller process address space, + where it will run fine. See ``Compiling the kernel and modules'' if + you need help building UML from source. + + + + + + + + + + + 5. Setting up serial lines and consoles + + + It is possible to attach UML serial lines and consoles to many types + of host I/O channels by specifying them on the command line. + + + You can attach them to host ptys, ttys, file descriptors, and ports. + This allows you to do things like + + o have a UML console appear on an unused host console, + + o hook two virtual machines together by having one attach to a pty + and having the other attach to the corresponding tty + + o make a virtual machine accessible from the net by attaching a + console to a port on the host. + + + The general format of the command line option is device=channel. + + + + 5.1. Specifying the device + + Devices are specified with "con" or "ssl" (console or serial line, + respectively), optionally with a device number if you are talking + about a specific device. + + + Using just "con" or "ssl" describes all of the consoles or serial + lines. If you want to talk about console #3 or serial line #10, they + would be "con3" and "ssl10", respectively. + + + A specific device name will override a less general "con=" or "ssl=". + So, for example, you can assign a pty to each of the serial lines + except for the first two like this: + + + ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 + + + + + The specificity of the device name is all that matters; order on the + command line is irrelevant. + + + + 5.2. Specifying the channel + + There are a number of different types of channels to attach a UML + device to, each with a different way of specifying exactly what to + attach to. + + o pseudo-terminals - device=pty pts terminals - device=pts + + + This will cause UML to allocate a free host pseudo-terminal for the + device. The terminal that it got will be announced in the boot + log. You access it by attaching a terminal program to the + corresponding tty: + + o screen /dev/pts/n + + o screen /dev/ttyxx + + o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts + devices + + o kermit - start it up, 'open' the device, then 'connect' + + + + + + o terminals - device=tty:tty device file + + + This will make UML attach the device to the specified tty (i.e + + + con1=tty:/dev/tty3 + + + + + will attach UML's console 1 to the host's /dev/tty3). If the tty that + you specify is the slave end of a tty/pty pair, something else must + have already opened the corresponding pty in order for this to work. + + + + + + o xterms - device=xterm + + + UML will run an xterm and the device will be attached to it. + + + + + + o Port - device=port:port number + + + This will attach the UML devices to the specified host port. + Attaching console 1 to the host's port 9000 would be done like + this: + + + con1=port:9000 + + + + + Attaching all the serial lines to that port would be done similarly: + + + ssl=port:9000 + + + + + You access these devices by telnetting to that port. Each active tel- + net session gets a different device. If there are more telnets to a + port than UML devices attached to it, then the extra telnet sessions + will block until an existing telnet detaches, or until another device + becomes active (i.e. by being activated in /etc/inittab). + + This channel has the advantage that you can both attach multiple UML + devices to it and know how to access them without reading the UML boot + log. It is also unique in allowing access to a UML from remote + machines without requiring that the UML be networked. This could be + useful in allowing public access to UMLs because they would be + accessible from the net, but wouldn't need any kind of network + filtering or access control because they would have no network access. + + + If you attach the main console to a portal, then the UML boot will + appear to hang. In reality, it's waiting for a telnet to connect, at + which point the boot will proceed. + + + + + + o already-existing file descriptors - device=file descriptor + + + If you set up a file descriptor on the UML command line, you can + attach a UML device to it. This is most commonly used to put the + main console back on stdin and stdout after assigning all the other + consoles to something else: + + + con0=fd:0,fd:1 con=pts + + + + + + + + + o Nothing - device=null + + + This allows the device to be opened, in contrast to 'none', but + reads will block, and writes will succeed and the data will be + thrown out. + + + + + + o None - device=none + + + This causes the device to disappear. + + + + You can also specify different input and output channels for a device + by putting a comma between them: + + + ssl3=tty:/dev/tty2,xterm + + + + + will cause serial line 3 to accept input on the host's /dev/tty2 and + display output on an xterm. That's a silly example - the most common + use of this syntax is to reattach the main console to stdin and stdout + as shown above. + + + If you decide to move the main console away from stdin/stdout, the + initial boot output will appear in the terminal that you're running + UML in. However, once the console driver has been officially + initialized, then the boot output will start appearing wherever you + specified that console 0 should be. That device will receive all + subsequent output. + + + + 5.3. Examples + + There are a number of interesting things you can do with this + capability. + + + First, this is how you get rid of those bleeding console xterms by + attaching them to host ptys: + + + con=pty con0=fd:0,fd:1 + + + + + This will make a UML console take over an unused host virtual console, + so that when you switch to it, you will see the UML login prompt + rather than the host login prompt: + + + con1=tty:/dev/tty6 + + + + + You can attach two virtual machines together with what amounts to a + serial line as follows: + + Run one UML with a serial line attached to a pty - + + + ssl1=pty + + + + + Look at the boot log to see what pty it got (this example will assume + that it got /dev/ptyp1). + + Boot the other UML with a serial line attached to the corresponding + tty - + + + ssl1=tty:/dev/ttyp1 + + + + + Log in, make sure that it has no getty on that serial line, attach a + terminal program like minicom to it, and you should see the login + prompt of the other virtual machine. + + + 6. Setting up the network + + + + This page describes how to set up the various transports and to + provide a UML instance with network access to the host, other machines + on the local net, and the rest of the net. + + + As of 2.4.5, UML networking has been completely redone to make it much + easier to set up, fix bugs, and add new features. + + + There is a new helper, uml_net, which does the host setup that + requires root privileges. + + + There are currently five transport types available for a UML virtual + machine to exchange packets with other hosts: + + o ethertap + + o TUN/TAP + + o Multicast + + o a switch daemon + + o slip + + o slirp + + o pcap + + The TUN/TAP, ethertap, slip, and slirp transports allow a UML + instance to exchange packets with the host. They may be directed + to the host or the host may just act as a router to provide access + to other physical or virtual machines. + + + The pcap transport is a synthetic read-only interface, using the + libpcap binary to collect packets from interfaces on the host and + filter them. This is useful for building preconfigured traffic + monitors or sniffers. + + + The daemon and multicast transports provide a completely virtual + network to other virtual machines. This network is completely + disconnected from the physical network unless one of the virtual + machines on it is acting as a gateway. + + + With so many host transports, which one should you use? Here's when + you should use each one: + + o ethertap - if you want access to the host networking and it is + running 2.2 + + o TUN/TAP - if you want access to the host networking and it is + running 2.4. Also, the TUN/TAP transport is able to use a + preconfigured device, allowing it to avoid using the setuid uml_net + helper, which is a security advantage. + + o Multicast - if you want a purely virtual network and you don't want + to set up anything but the UML + + o a switch daemon - if you want a purely virtual network and you + don't mind running the daemon in order to get somewhat better + performance + + o slip - there is no particular reason to run the slip backend unless + ethertap and TUN/TAP are just not available for some reason + + o slirp - if you don't have root access on the host to setup + networking, or if you don't want to allocate an IP to your UML + + o pcap - not much use for actual network connectivity, but great for + monitoring traffic on the host + + Ethertap is available on 2.4 and works fine. TUN/TAP is preferred + to it because it has better performance and ethertap is officially + considered obsolete in 2.4. Also, the root helper only needs to + run occasionally for TUN/TAP, rather than handling every packet, as + it does with ethertap. This is a slight security advantage since + it provides fewer opportunities for a nasty UML user to somehow + exploit the helper's root privileges. + + + 6.1. General setup + + First, you must have the virtual network enabled in your UML. If are + running a prebuilt kernel from this site, everything is already + enabled. If you build the kernel yourself, under the "Network device + support" menu, enable "Network device support", and then the three + transports. + + + The next step is to provide a network device to the virtual machine. + This is done by describing it on the kernel command line. + + The general format is + + + eth = , + + + + + For example, a virtual ethernet device may be attached to a host + ethertap device as follows: + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + This sets up eth0 inside the virtual machine to attach itself to the + host /dev/tap0, assigns it an ethernet address, and assigns the host + tap0 interface an IP address. + + + + Note that the IP address you assign to the host end of the tap device + must be different than the IP you assign to the eth device inside UML. + If you are short on IPs and don't want to consume two per UML, then + you can reuse the host's eth IP address for the host ends of the tap + devices. Internally, the UMLs must still get unique IPs for their eth + devices. You can also give the UMLs non-routable IPs (192.168.x.x or + 10.x.x.x) and have the host masquerade them. This will let outgoing + connections work, but incoming connections won't without more work, + such as port forwarding from the host. + Also note that when you configure the host side of an interface, it is + only acting as a gateway. It will respond to pings sent to it + locally, but is not useful to do that since it's a host interface. + You are not talking to the UML when you ping that interface and get a + response. + + + You can also add devices to a UML and remove them at runtime. See the + ``The Management Console'' page for details. + + + The sections below describe this in more detail. + + + Once you've decided how you're going to set up the devices, you boot + UML, log in, configure the UML side of the devices, and set up routes + to the outside world. At that point, you will be able to talk to any + other machines, physical or virtual, on the net. + + + If ifconfig inside UML fails and the network refuses to come up, run + tell you what went wrong. + + + + 6.2. Userspace daemons + + You will likely need the setuid helper, or the switch daemon, or both. + They are both installed with the RPM and deb, so if you've installed + either, you can skip the rest of this section. + + + If not, then you need to check them out of CVS, build them, and + install them. The helper is uml_net, in CVS /tools/uml_net, and the + daemon is uml_switch, in CVS /tools/uml_router. They are both built + with a plain 'make'. Both need to be installed in a directory that's + in your path - /usr/bin is recommend. On top of that, uml_net needs + to be setuid root. + + + + 6.3. Specifying ethernet addresses + + Below, you will see that the TUN/TAP, ethertap, and daemon interfaces + allow you to specify hardware addresses for the virtual ethernet + devices. This is generally not necessary. If you don't have a + specific reason to do it, you probably shouldn't. If one is not + specified on the command line, the driver will assign one based on the + device IP address. It will provide the address fe:fd:nn:nn:nn:nn + where nn.nn.nn.nn is the device IP address. This is nearly always + sufficient to guarantee a unique hardware address for the device. A + couple of exceptions are: + + o Another set of virtual ethernet devices are on the same network and + they are assigned hardware addresses using a different scheme which + may conflict with the UML IP address-based scheme + + o You aren't going to use the device for IP networking, so you don't + assign the device an IP address + + If you let the driver provide the hardware address, you should make + sure that the device IP address is known before the interface is + brought up. So, inside UML, this will guarantee that: + + + + UML# + ifconfig eth0 192.168.0.250 up + + + + + If you decide to assign the hardware address yourself, make sure that + the first byte of the address is even. Addresses with an odd first + byte are broadcast addresses, which you don't want assigned to a + device. + + + + 6.4. UML interface setup + + Once the network devices have been described on the command line, you + should boot UML and log in. + + + The first thing to do is bring the interface up: + + + UML# ifconfig ethn ip-address up + + + + + You should be able to ping the host at this point. + + + To reach the rest of the world, you should set a default route to the + host: + + + UML# route add default gw host ip + + + + + Again, with host ip of 192.168.0.4: + + + UML# route add default gw 192.168.0.4 + + + + + This page used to recommend setting a network route to your local net. + This is wrong, because it will cause UML to try to figure out hardware + addresses of the local machines by arping on the interface to the + host. Since that interface is basically a single strand of ethernet + with two nodes on it (UML and the host) and arp requests don't cross + networks, they will fail to elicit any responses. So, what you want + is for UML to just blindly throw all packets at the host and let it + figure out what to do with them, which is what leaving out the network + route and adding the default route does. + + + Note: If you can't communicate with other hosts on your physical + ethernet, it's probably because of a network route that's + automatically set up. If you run 'route -n' and see a route that + looks like this: + + + + + Destination Gateway Genmask Flags Metric Ref Use Iface + 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 + + + + + with a mask that's not 255.255.255.255, then replace it with a route + to your host: + + + UML# + route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 + + + + + + + UML# + route add -host 192.168.0.4 dev eth0 + + + + + This, plus the default route to the host, will allow UML to exchange + packets with any machine on your ethernet. + + + + 6.5. Multicast + + The simplest way to set up a virtual network between multiple UMLs is + to use the mcast transport. This was written by Harald Welte and is + present in UML version 2.4.5-5um and later. Your system must have + multicast enabled in the kernel and there must be a multicast-capable + network device on the host. Normally, this is eth0, but if there is + no ethernet card on the host, then you will likely get strange error + messages when you bring the device up inside UML. + + + To use it, run two UMLs with + + + eth0=mcast + + + + + on their command lines. Log in, configure the ethernet device in each + machine with different IP addresses: + + + UML1# ifconfig eth0 192.168.0.254 + + + + + + + UML2# ifconfig eth0 192.168.0.253 + + + + + and they should be able to talk to each other. + + The full set of command line options for this transport are + + + + ethn=mcast,ethernet address,multicast + address,multicast port,ttl + + + + + Harald's original README is here and explains these in detail, as well as + some other issues. + + There is also a related point-to-point only "ucast" transport. + This is useful when your network does not support multicast, and + all network connections are simple point to point links. + + The full set of command line options for this transport are + + + ethn=ucast,ethernet address,remote address,listen port,remote port + + + + + 6.6. TUN/TAP with the uml_net helper + + TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the + host. The TUN/TAP backend has been in UML since 2.4.9-3um. + + + The easiest way to get up and running is to let the setuid uml_net + helper do the host setup for you. This involves insmod-ing the tun.o + module if necessary, configuring the device, and setting up IP + forwarding, routing, and proxy arp. If you are new to UML networking, + do this first. If you're concerned about the security implications of + the setuid helper, use it to get up and running, then read the next + section to see how to have UML use a preconfigured tap device, which + avoids the use of uml_net. + + + If you specify an IP address for the host side of the device, the + uml_net helper will do all necessary setup on the host - the only + requirement is that TUN/TAP be available, either built in to the host + kernel or as the tun.o module. + + The format of the command line switch to attach a device to a TUN/TAP + device is + + + eth =tuntap,,, + + + + + For example, this argument will attach the UML's eth0 to the next + available tap device and assign an ethernet address to it based on its + IP address + + + eth0=tuntap,,,192.168.0.254 + + + + + + + Note that the IP address that must be used for the eth device inside + UML is fixed by the routing and proxy arp that is set up on the + TUN/TAP device on the host. You can use a different one, but it won't + work because reply packets won't reach the UML. This is a feature. + It prevents a nasty UML user from doing things like setting the UML IP + to the same as the network's nameserver or mail server. + + + There are a couple potential problems with running the TUN/TAP + transport on a 2.4 host kernel + + o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host + kernel or use the ethertap transport. + + o With an upgraded kernel, TUN/TAP may fail with + + + File descriptor in bad state + + + + + This is due to a header mismatch between the upgraded kernel and the + kernel that was originally installed on the machine. The fix is to + make sure that /usr/src/linux points to the headers for the running + kernel. + + These were pointed out by Tim Robinson in + name="this uml- + user post"> . + + + + 6.7. TUN/TAP with a preconfigured tap device + + If you prefer not to have UML use uml_net (which is somewhat + insecure), with UML 2.4.17-11, you can set up a TUN/TAP device + beforehand. The setup needs to be done as root, but once that's done, + there is no need for root assistance. Setting up the device is done + as follows: + + o Create the device with tunctl (available from the UML utilities + tarball) + + + + + host# tunctl -u uid + + + + + where uid is the user id or username that UML will be run as. This + will tell you what device was created. + + o Configure the device IP (change IP addresses and device name to + suit) + + + + + host# ifconfig tap0 192.168.0.254 up + + + + + + o Set up routing and arping if desired - this is my recipe, there are + other ways of doing the same thing + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' + + host# + route add -host 192.168.0.253 dev tap0 + + + + + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' + + + + + + + host# + arp -Ds 192.168.0.253 eth0 pub + + + + + Note that this must be done every time the host boots - this configu- + ration is not stored across host reboots. So, it's probably a good + idea to stick it in an rc file. An even better idea would be a little + utility which reads the information from a config file and sets up + devices at boot time. + + o Rather than using up two IPs and ARPing for one of them, you can + also provide direct access to your LAN by the UML by using a + bridge. + + + host# + brctl addbr br0 + + + + + + + host# + ifconfig eth0 0.0.0.0 promisc up + + + + + + + host# + ifconfig tap0 0.0.0.0 promisc up + + + + + + + host# + ifconfig br0 192.168.0.1 netmask 255.255.255.0 up + + + + + + + + host# + brctl stp br0 off + + + + + + + host# + brctl setfd br0 1 + + + + + + + host# + brctl sethello br0 1 + + + + + + + host# + brctl addif br0 eth0 + + + + + + + host# + brctl addif br0 tap0 + + + + + Note that 'br0' should be setup using ifconfig with the existing IP + address of eth0, as eth0 no longer has its own IP. + + o + + + Also, the /dev/net/tun device must be writable by the user running + UML in order for the UML to use the device that's been configured + for it. The simplest thing to do is + + + host# chmod 666 /dev/net/tun + + + + + Making it world-writable looks bad, but it seems not to be + exploitable as a security hole. However, it does allow anyone to cre- + ate useless tap devices (useless because they can't configure them), + which is a DOS attack. A somewhat more secure alternative would to be + to create a group containing all the users who have preconfigured tap + devices and chgrp /dev/net/tun to that group with mode 664 or 660. + + + o Once the device is set up, run UML with 'eth0=tuntap,device name' + (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the + mconsole config command). + + o Bring the eth device up in UML and you're in business. + + If you don't want that tap device any more, you can make it non- + persistent with + + + host# tunctl -d tap device + + + + + Finally, tunctl has a -b (for brief mode) switch which causes it to + output only the name of the tap device it created. This makes it + suitable for capture by a script: + + + host# TAP=`tunctl -u 1000 -b` + + + + + + + 6.8. Ethertap + + Ethertap is the general mechanism on 2.2 for userspace processes to + exchange packets with the kernel. + + + + To use this transport, you need to describe the virtual network device + on the UML command line. The general format for this is + + + eth =ethertap, , , + + + + + So, the previous example + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + attaches the UML eth0 device to the host /dev/tap0, assigns it the + ethernet address fe:fd:0:0:0:1, and assigns the IP address + 192.168.0.254 to the tap device. + + + + The tap device is mandatory, but the others are optional. If the + ethernet address is omitted, one will be assigned to it. + + + The presence of the tap IP address will cause the helper to run and do + whatever host setup is needed to allow the virtual machine to + communicate with the outside world. If you're not sure you know what + you're doing, this is the way to go. + + + If it is absent, then you must configure the tap device and whatever + arping and routing you will need on the host. However, even in this + case, the uml_net helper still needs to be in your path and it must be + setuid root if you're not running UML as root. This is because the + tap device doesn't support SIGIO, which UML needs in order to use + something as a source of input. So, the helper is used as a + convenient asynchronous IO thread. + + If you're using the uml_net helper, you can ignore the following host + setup - uml_net will do it for you. You just need to make sure you + have ethertap available, either built in to the host kernel or + available as a module. + + + If you want to set things up yourself, you need to make sure that the + appropriate /dev entry exists. If it doesn't, become root and create + it as follows: + + + mknod /dev/tap c 36 + 16 + + + + + For example, this is how to create /dev/tap0: + + + mknod /dev/tap0 c 36 0 + 16 + + + + + You also need to make sure that the host kernel has ethertap support. + If ethertap is enabled as a module, you apparently need to insmod + ethertap once for each ethertap device you want to enable. So, + + + host# + insmod ethertap + + + + + will give you the tap0 interface. To get the tap1 interface, you need + to run + + + host# + insmod ethertap unit=1 -o ethertap1 + + + + + + + + 6.9. The switch daemon + + Note: This is the daemon formerly known as uml_router, but which was + renamed so the network weenies of the world would stop growling at me. + + + The switch daemon, uml_switch, provides a mechanism for creating a + totally virtual network. By default, it provides no connection to the + host network (but see -tap, below). + + + The first thing you need to do is run the daemon. Running it with no + arguments will make it listen on a default pair of unix domain + sockets. + + + If you want it to listen on a different pair of sockets, use + + + -unix control socket data socket + + + + + + If you want it to act as a hub rather than a switch, use + + + -hub + + + + + + If you want the switch to be connected to host networking (allowing + the umls to get access to the outside world through the host), use + + + -tap tap0 + + + + + + Note that the tap device must be preconfigured (see "TUN/TAP with a + preconfigured tap device", above). If you're using a different tap + device than tap0, specify that instead of tap0. + + + uml_switch can be backgrounded as follows + + + host% + uml_switch [ options ] < /dev/null > /dev/null + + + + + The reason it doesn't background by default is that it listens to + stdin for EOF. When it sees that, it exits. + + + The general format of the kernel command line switch is + + + + ethn=daemon,ethernet address,socket + type,control socket,data socket + + + + + You can leave off everything except the 'daemon'. You only need to + specify the ethernet address if the one that will be assigned to it + isn't acceptable for some reason. The rest of the arguments describe + how to communicate with the daemon. You should only specify them if + you told the daemon to use different sockets than the default. So, if + you ran the daemon with no arguments, running the UML on the same + machine with + eth0=daemon + + + + + will cause the eth0 driver to attach itself to the daemon correctly. + + + + 6.10. Slip + + Slip is another, less general, mechanism for a process to communicate + with the host networking. In contrast to the ethertap interface, + which exchanges ethernet frames with the host and can be used to + transport any higher-level protocol, it can only be used to transport + IP. + + + The general format of the command line switch is + + + + ethn=slip,slip IP + + + + + The slip IP argument is the IP address that will be assigned to the + host end of the slip device. If it is specified, the helper will run + and will set up the host so that the virtual machine can reach it and + the rest of the network. + + + There are some oddities with this interface that you should be aware + of. You should only specify one slip device on a given virtual + machine, and its name inside UML will be 'umn', not 'eth0' or whatever + you specified on the command line. These problems will be fixed at + some point. + + + + 6.11. Slirp + + slirp uses an external program, usually /usr/bin/slirp, to provide IP + only networking connectivity through the host. This is similar to IP + masquerading with a firewall, although the translation is performed in + user-space, rather than by the kernel. As slirp does not set up any + interfaces on the host, or changes routing, slirp does not require + root access or setuid binaries on the host. + + + The general format of the command line switch for slirp is: + + + + ethn=slirp,ethernet address,slirp path + + + + + The ethernet address is optional, as UML will set up the interface + with an ethernet address based upon the initial IP address of the + interface. The slirp path is generally /usr/bin/slirp, although it + will depend on distribution. + + + The slirp program can have a number of options passed to the command + line and we can't add them to the UML command line, as they will be + parsed incorrectly. Instead, a wrapper shell script can be written or + the options inserted into the /.slirprc file. More information on + all of the slirp options can be found in its man pages. + + + The eth0 interface on UML should be set up with the IP 10.2.0.15, + although you can use anything as long as it is not used by a network + you will be connecting to. The default route on UML should be set to + use + + + UML# + route add default dev eth0 + + + + + slirp provides a number of useful IP addresses which can be used by + UML, such as 10.0.2.3 which is an alias for the DNS server specified + in /etc/resolv.conf on the host or the IP given in the 'dns' option + for slirp. + + + Even with a baudrate setting higher than 115200, the slirp connection + is limited to 115200. If you need it to go faster, the slirp binary + needs to be compiled with FULL_BOLT defined in config.h. + + + + 6.12. pcap + + The pcap transport is attached to a UML ethernet device on the command + line or with uml_mconsole with the following syntax: + + + + ethn=pcap,host interface,filter + expression,option1,option2 + + + + + The expression and options are optional. + + + The interface is whatever network device on the host you want to + sniff. The expression is a pcap filter expression, which is also what + tcpdump uses, so if you know how to specify tcpdump filters, you will + use the same expressions here. The options are up to two of + 'promisc', control whether pcap puts the host interface into + promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap + expression optimizer is used. + + + Example: + + + + eth0=pcap,eth0,tcp + + eth1=pcap,eth0,!tcp + + + + will cause the UML eth0 to emit all tcp packets on the host eth0 and + the UML eth1 to emit all non-tcp packets on the host eth0. + + + + 6.13. Setting up the host yourself + + If you don't specify an address for the host side of the ethertap or + slip device, UML won't do any setup on the host. So this is what is + needed to get things working (the examples use a host-side IP of + 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your + own network): + + o The device needs to be configured with its IP address. Tap devices + are also configured with an mtu of 1484. Slip devices are + configured with a point-to-point address pointing at the UML ip + address. + + + host# ifconfig tap0 arp mtu 1484 192.168.0.251 up + + + + + + + host# + ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up + + + + + + o If a tap device is being set up, a route is set to the UML IP. + + + UML# route add -host 192.168.0.250 gw 192.168.0.251 + + + + + + o To allow other hosts on your network to see the virtual machine, + proxy arp is set up for it. + + + host# arp -Ds 192.168.0.250 eth0 pub + + + + + + o Finally, the host is set up to route packets. + + + host# echo 1 > /proc/sys/net/ipv4/ip_forward + + + + + + + + + + + 7. Sharing Filesystems between Virtual Machines + + + + + 7.1. A warning + + Don't attempt to share filesystems simply by booting two UMLs from the + same file. That's the same thing as booting two physical machines + from a shared disk. It will result in filesystem corruption. + + + + 7.2. Using layered block devices + + The way to share a filesystem between two virtual machines is to use + the copy-on-write (COW) layering capability of the ubd block driver. + As of 2.4.6-2um, the driver supports layering a read-write private + device over a read-only shared device. A machine's writes are stored + in the private device, while reads come from either device - the + private one if the requested block is valid in it, the shared one if + not. Using this scheme, the majority of data which is unchanged is + shared between an arbitrary number of virtual machines, each of which + has a much smaller file containing the changes that it has made. With + a large number of UMLs booting from a large root filesystem, this + leads to a huge disk space saving. It will also help performance, + since the host will be able to cache the shared data using a much + smaller amount of memory, so UML disk requests will be served from the + host's memory rather than its disks. + + + + + To add a copy-on-write layer to an existing block device file, simply + add the name of the COW file to the appropriate ubd switch: + + + ubd0=root_fs_cow,root_fs_debian_22 + + + + + where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is + the existing shared filesystem. The COW file need not exist. If it + doesn't, the driver will create and initialize it. Once the COW file + has been initialized, it can be used on its own on the command line: + + + ubd0=root_fs_cow + + + + + The name of the backing file is stored in the COW file header, so it + would be redundant to continue specifying it on the command line. + + + + 7.3. Note! + + When checking the size of the COW file in order to see the gobs of + space that you're saving, make sure you use 'ls -ls' to see the actual + disk consumption rather than the length of the file. The COW file is + sparse, so the length will be very different from the disk usage. + Here is a 'ls -l' of a COW file and backing file from one boot and + shutdown: + host% ls -l cow.debian debian2.2 + -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Doesn't look like much saved space, does it? Well, here's 'ls -ls': + + + host% ls -ls cow.debian debian2.2 + 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Now, you can see that the COW file has less than a meg of disk, rather + than 492 meg. + + + + 7.4. Another warning + + Once a filesystem is being used as a readonly backing file for a COW + file, do not boot directly from it or modify it in any way. Doing so + will invalidate any COW files that are using it. The mtime and size + of the backing file are stored in the COW file header at its creation, + and they must continue to match. If they don't, the driver will + refuse to use the COW file. + + + + + If you attempt to evade this restriction by changing either the + backing file or the COW header by hand, you will get a corrupted + filesystem. + + + + + Among other things, this means that upgrading the distribution in a + backing file and expecting that all of the COW files using it will see + the upgrade will not work. + + + + + 7.5. uml_moo : Merging a COW file with its backing file + + Depending on how you use UML and COW devices, it may be advisable to + merge the changes in the COW file into the backing file every once in + a while. + + + + + The utility that does this is uml_moo. Its usage is + + + host% uml_moo COW file new backing file + + + + + There's no need to specify the backing file since that information is + already in the COW file header. If you're paranoid, boot the new + merged file, and if you're happy with it, move it over the old backing + file. + + + + + uml_moo creates a new backing file by default as a safety measure. It + also has a destructive merge option which will merge the COW file + directly into its current backing file. This is really only usable + when the backing file only has one COW file associated with it. If + there are multiple COWs associated with a backing file, a -d merge of + one of them will invalidate all of the others. However, it is + convenient if you're short of disk space, and it should also be + noticeably faster than a non-destructive merge. + + + + + uml_moo is installed with the UML deb and RPM. If you didn't install + UML from one of those packages, you can also get it from the UML + utilities tar file in tools/moo. + + + + + + + + + 8. Creating filesystems + + + You may want to create and mount new UML filesystems, either because + your root filesystem isn't large enough or because you want to use a + filesystem other than ext2. + + + This was written on the occasion of reiserfs being included in the + 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will + talk about reiserfs. This information is generic, and the examples + should be easy to translate to the filesystem of your choice. + + + 8.1. Create the filesystem file + + dd is your friend. All you need to do is tell dd to create an empty + file of the appropriate size. I usually make it sparse to save time + and to avoid allocating disk space until it's actually used. For + example, the following command will create a sparse 100 meg file full + of zeroes. + + + host% + dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M + + + + + + + 8.2. Assign the file to a UML device + + Add an argument like the following to the UML command line: + + ubd4=new_filesystem + + + + + making sure that you use an unassigned ubd device number. + + + + 8.3. Creating and mounting the filesystem + + Make sure that the filesystem is available, either by being built into + the kernel, or available as a module, then boot up UML and log in. If + the root filesystem doesn't have the filesystem utilities (mkfs, fsck, + etc), then get them into UML by way of the net or hostfs. + + + Make the new filesystem on the device assigned to the new file: + + + host# mkreiserfs /dev/ubd/4 + + + <----------- MKREISERFSv2 -----------> + + ReiserFS version 3.6.25 + Block size 4096 bytes + Block count 25856 + Used blocks 8212 + Journal - 8192 blocks (18-8209), journal header is in block 8210 + Bitmaps: 17 + Root block 8211 + Hash function "r5" + ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y + journal size 8192 (from 18) + Initializing journal - 0%....20%....40%....60%....80%....100% + Syncing..done. + + + + + Now, mount it: + + + UML# + mount /dev/ubd/4 /mnt + + + + + and you're in business. + + + + + + + + + + 9. Host file access + + + If you want to access files on the host machine from inside UML, you + can treat it as a separate machine and either nfs mount directories + from the host or copy files into the virtual machine with scp or rcp. + However, since UML is running on the host, it can access those + files just like any other process and make them available inside the + virtual machine without needing to use the network. + + + This is now possible with the hostfs virtual filesystem. With it, you + can mount a host directory into the UML filesystem and access the + files contained in it just as you would on the host. + + + 9.1. Using hostfs + + To begin with, make sure that hostfs is available inside the virtual + machine with + + + UML# cat /proc/filesystems + + + + . hostfs should be listed. If it's not, either rebuild the kernel + with hostfs configured into it or make sure that hostfs is built as a + module and available inside the virtual machine, and insmod it. + + + Now all you need to do is run mount: + + + UML# mount none /mnt/host -t hostfs + + + + + will mount the host's / on the virtual machine's /mnt/host. + + + If you don't want to mount the host root directory, then you can + specify a subdirectory to mount with the -o switch to mount: + + + UML# mount none /mnt/home -t hostfs -o /home + + + + + will mount the hosts's /home on the virtual machine's /mnt/home. + + + + 9.2. hostfs as the root filesystem + + It's possible to boot from a directory hierarchy on the host using + hostfs rather than using the standard filesystem in a file. + + To start, you need that hierarchy. The easiest way is to loop mount + an existing root_fs file: + + + host# mount root_fs uml_root_dir -o loop + + + + + You need to change the filesystem type of / in etc/fstab to be + 'hostfs', so that line looks like this: + + /dev/ubd/0 / hostfs defaults 1 1 + + + + + Then you need to chown to yourself all the files in that directory + that are owned by root. This worked for me: + + + host# find . -uid 0 -exec chown jdike {} \; + + + + + Next, make sure that your UML kernel has hostfs compiled in, not as a + module. Then run UML with the boot device pointing at that directory: + + + ubd0=/path/to/uml/root/directory + + + + + UML should then boot as it does normally. + + + 9.3. Building hostfs + + If you need to build hostfs because it's not in your kernel, you have + two choices: + + + + o Compiling hostfs into the kernel: + + + Reconfigure the kernel and set the 'Host filesystem' option under + + + o Compiling hostfs as a module: + + + Reconfigure the kernel and set the 'Host filesystem' option under + be in arch/um/fs/hostfs/hostfs.o. Install that in + /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and + + + UML# insmod hostfs + + + + + + + + + + + + + 10. The Management Console + + + + The UML management console is a low-level interface to the kernel, + somewhat like the i386 SysRq interface. Since there is a full-blown + operating system under UML, there is much greater flexibility possible + than with the SysRq mechanism. + + + There are a number of things you can do with the mconsole interface: + + o get the kernel version + + o add and remove devices + + o halt or reboot the machine + + o Send SysRq commands + + o Pause and resume the UML + + + You need the mconsole client (uml_mconsole) which is present in CVS + (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in + 2.4.6. + + + You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. + When you boot UML, you'll see a line like: + + + mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole + + + + + If you specify a unique machine id one the UML command line, i.e. + + + umid=debian + + + + + you'll see this + + + mconsole initialized on /home/jdike/.uml/debian/mconsole + + + + + That file is the socket that uml_mconsole will use to communicate with + UML. Run it with either the umid or the full path as its argument: + + + host% uml_mconsole debian + + + + + or + + + host% uml_mconsole /home/jdike/.uml/debian/mconsole + + + + + You'll get a prompt, at which you can run one of these commands: + + o version + + o halt + + o reboot + + o config + + o remove + + o sysrq + + o help + + o cad + + o stop + + o go + + + 10.1. version + + This takes no arguments. It prints the UML version. + + + (mconsole) version + OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 + + + + + There are a couple actual uses for this. It's a simple no-op which + can be used to check that a UML is running. It's also a way of + sending an interrupt to the UML. This is sometimes useful on SMP + hosts, where there's a bug which causes signals to UML to be lost, + often causing it to appear to hang. Sending such a UML the mconsole + version command is a good way to 'wake it up' before networking has + been enabled, as it does not do anything to the function of the UML. + + + + 10.2. halt and reboot + + These take no arguments. They shut the machine down immediately, with + no syncing of disks and no clean shutdown of userspace. So, they are + pretty close to crashing the machine. + + + (mconsole) halt + OK + + + + + + + 10.3. config + + "config" adds a new device to the virtual machine. Currently the ubd + and network drivers support this. It takes one argument, which is the + device to add, with the same syntax as the kernel command line. + + + + + (mconsole) + config ubd3=/home/jdike/incoming/roots/root_fs_debian22 + + OK + (mconsole) config eth1=mcast + OK + + + + + + + 10.4. remove + + "remove" deletes a device from the system. Its argument is just the + name of the device to be removed. The device must be idle in whatever + sense the driver considers necessary. In the case of the ubd driver, + the removed block device must not be mounted, swapped on, or otherwise + open, and in the case of the network driver, the device must be down. + + + (mconsole) remove ubd3 + OK + (mconsole) remove eth1 + OK + + + + + + + 10.5. sysrq + + This takes one argument, which is a single letter. It calls the + generic kernel's SysRq driver, which does whatever is called for by + that argument. See the SysRq documentation in + Documentation/admin-guide/sysrq.rst in your favorite kernel tree to + see what letters are valid and what they do. + + + + 10.6. help + + "help" returns a string listing the valid commands and what each one + does. + + + + 10.7. cad + + This invokes the Ctl-Alt-Del action on init. What exactly this ends + up doing is up to /etc/inittab. Normally, it reboots the machine. + With UML, this is usually not desired, so if a halt would be better, + then find the section of inittab that looks like this + + + # What to do when CTRL-ALT-DEL is pressed. + ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now + + + + + and change the command to halt. + + + + 10.8. stop + + This puts the UML in a loop reading mconsole requests until a 'go' + mconsole command is received. This is very useful for making backups + of UML filesystems, as the UML can be stopped, then synced via 'sysrq + s', so that everything is written to the filesystem. You can then copy + the filesystem and then send the UML 'go' via mconsole. + + + Note that a UML running with more than one CPU will have problems + after you send the 'stop' command, as only one CPU will be held in a + mconsole loop and all others will continue as normal. This is a bug, + and will be fixed. + + + + 10.9. go + + This resumes a UML after being paused by a 'stop' command. Note that + when the UML has resumed, TCP connections may have timed out and if + the UML is paused for a long period of time, crond might go a little + crazy, running all the jobs it didn't do earlier. + + + + + + + + + 11. Kernel debugging + + + Note: The interface that makes debugging, as described here, possible + is present in 2.4.0-test6 kernels and later. + + + Since the user-mode kernel runs as a normal Linux process, it is + possible to debug it with gdb almost like any other process. It is + slightly different because the kernel's threads are already being + ptraced for system call interception, so gdb can't ptrace them. + However, a mechanism has been added to work around that problem. + + + In order to debug the kernel, you need build it from source. See + ``Compiling the kernel and modules'' for information on doing that. + Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during + the config. These will compile the kernel with -g, and enable the + ptrace proxy so that gdb works with UML, respectively. + + + + + 11.1. Starting the kernel under gdb + + You can have the kernel running under the control of gdb from the + beginning by putting 'debug' on the command line. You will get an + xterm with gdb running inside it. The kernel will send some commands + to gdb which will leave it stopped at the beginning of start_kernel. + At this point, you can get things going with 'next', 'step', or + 'cont'. + + + There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an + interrupt handler. + 11.2. Examining sleeping processes + + Not every bug is evident in the currently running process. Sometimes, + processes hang in the kernel when they shouldn't because they've + deadlocked on a semaphore or something similar. In this case, when + you ^C gdb and get a backtrace, you will see the idle thread, which + isn't very relevant. + + + What you want is the stack of whatever process is sleeping when it + shouldn't be. You need to figure out which process that is, which is + generally fairly easy. Then you need to get its host process id, + which you can do either by looking at ps on the host or at + task.thread.extern_pid in gdb. + + + Now what you do is this: + + o detach from the current thread + + + (UML gdb) det + + + + + + o attach to the thread you are interested in + + + (UML gdb) att + + + + + + o look at its stack and anything else of interest + + + (UML gdb) bt + + + + + Note that you can't do anything at this point that requires that a + process execute, e.g. calling a function + + o when you're done looking at that process, reattach to the current + thread and continue it + + + (UML gdb) + att 1 + + + + + + + (UML gdb) + c + + + + + Here, specifying any pid which is not the process id of a UML thread + will cause gdb to reattach to the current thread. I commonly use 1, + but any other invalid pid would work. + + + + 11.3. Running ddd on UML + + ddd works on UML, but requires a special kludge. The process goes + like this: + + o Start ddd + + + host% ddd linux + + + + + + o With ps, get the pid of the gdb that ddd started. You can ask the + gdb to tell you, but for some reason that confuses things and + causes a hang. + + o run UML with 'debug=parent gdb-pid=' added to the command line + - it will just sit there after you hit return + + o type 'att 1' to the ddd gdb and you will see something like + + + 0xa013dc51 in __kill () + + + (gdb) + + + + + + o At this point, type 'c', UML will boot up, and you can use ddd just + as you do on any other process. + + + + 11.4. Debugging modules + + gdb has support for debugging code which is dynamically loaded into + the process. This support is what is needed to debug kernel modules + under UML. + + + Using that support is somewhat complicated. You have to tell gdb what + object file you just loaded into UML and where in memory it is. Then, + it can read the symbol table, and figure out where all the symbols are + from the load address that you provided. It gets more interesting + when you load the module again (i.e. after an rmmod). You have to + tell gdb to forget about all its symbols, including the main UML ones + for some reason, then load then all back in again. + + + There's an easy way and a hard way to do this. The easy way is to use + the umlgdb expect script written by Chandan Kudige. It basically + automates the process for you. + + + First, you must tell it where your modules are. There is a list in + the script that looks like this: + set MODULE_PATHS { + "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" + "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" + "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" + } + + + + + You change that to list the names and paths of the modules that you + are going to debug. Then you run it from the toplevel directory of + your UML pool and it basically tells you what to do: + + + + + ******** GDB pid is 21903 ******** + Start UML as: ./linux debug gdb-pid=21903 + + + + GNU gdb 5.0rh-5 Red Hat Linux 7.1 + Copyright 2001 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) b sys_init_module + Breakpoint 1 at 0xa0011923: file module.c, line 349. + (gdb) att 1 + + + + + After you run UML and it sits there doing nothing, you hit return at + the 'att 1' and continue it: + + + Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 + 0xa00f4221 in __kill () + (UML gdb) c + Continuing. + + + + + At this point, you debug normally. When you insmod something, the + expect magic will kick in and you'll see something like: + + + + + + + + + + + + + + + + + + *** Module hostfs loaded *** + Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 349 char *name, *n_name, *name_tmp = NULL; + (UML gdb) finish + Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 + 411 else res = EXECUTE_SYSCALL(syscall, regs); + Value returned is $1 = 0 + (UML gdb) + p/x (int)module_list + module_list->size_of_struct + + $2 = 0xa9021054 + (UML gdb) symbol-file ./linux + Load new symbol table from "./linux"? (y or n) y + Reading symbols from ./linux... + done. + (UML gdb) + add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 + + add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at + .text_addr = 0xa9021054 + (y or n) y + + Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... + done. + (UML gdb) p *module_list + $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", + size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, + nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, + init = 0xa90221f0 , cleanup = 0xa902222c , + ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, + persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, + kallsyms_end = 0x0, + archdata_start = 0x1b855
, + archdata_end = 0xe5890000
, + kernel_data = 0xf689c35d
} + >> Finished loading symbols for hostfs ... + + + + + That's the easy way. It's highly recommended. The hard way is + described below in case you're interested in what's going on. + + + Boot the kernel under the debugger and load the module with insmod or + modprobe. With gdb, do: + + + (UML gdb) p module_list + + + + + This is a list of modules that have been loaded into the kernel, with + the most recently loaded module first. Normally, the module you want + is at module_list. If it's not, walk down the next links, looking at + the name fields until find the module you want to debug. Take the + address of that structure, and add module.size_of_struct (which in + 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition + for you :-): + + + + (UML gdb) + printf "%#x\n", (int)module_list module_list->size_of_struct + + + + + The offset from the module start occasionally changes (before 2.4.0, + it was module.size_of_struct + 4), so it's a good idea to check the + init and cleanup addresses once in a while, as describe below. Now + do: + + + (UML gdb) + add-symbol-file /path/to/module/on/host that_address + + + + + Tell gdb you really want to do it, and you're in business. + + + If there's any doubt that you got the offset right, like breakpoints + appear not to work, or they're appearing in the wrong place, you can + check it by looking at the module structure. The init and cleanup + fields should look like: + + + init = 0x588066b0 , cleanup = 0x588066c0 + + + + + with no offsets on the symbol names. If the names are right, but they + are offset, then the offset tells you how much you need to add to the + address you gave to add-symbol-file. + + + When you want to load in a new version of the module, you need to get + gdb to forget about the old one. The only way I've found to do that + is to tell gdb to forget about all symbols that it knows about: + + + (UML gdb) symbol-file + + + + + Then reload the symbols from the kernel binary: + + + (UML gdb) symbol-file /path/to/kernel + + + + + and repeat the process above. You'll also need to re-enable break- + points. They were disabled when you dumped all the symbols because + gdb couldn't figure out where they should go. + + + + 11.5. Attaching gdb to the kernel + + If you don't have the kernel running under gdb, you can attach gdb to + it later by sending the tracing thread a SIGUSR1. The first line of + the console output identifies its pid: + tracing thread pid = 20093 + + + + + When you send it the signal: + + + host% kill -USR1 20093 + + + + + you will get an xterm with gdb running in it. + + + If you have the mconsole compiled into UML, then the mconsole client + can be used to start gdb: + + + (mconsole) (mconsole) config gdb=xterm + + + + + will fire up an xterm with gdb running in it. + + + + 11.6. Using alternate debuggers + + UML has support for attaching to an already running debugger rather + than starting gdb itself. This is present in CVS as of 17 Apr 2001. + I sent it to Alan for inclusion in the ac tree, and it will be in my + 2.4.4 release. + + + This is useful when gdb is a subprocess of some UI, such as emacs or + ddd. It can also be used to run debuggers other than gdb on UML. + Below is an example of using strace as an alternate debugger. + + + To do this, you need to get the pid of the debugger and pass it in + with the + + + If you are using gdb under some UI, then tell it to 'att 1', and + you'll find yourself attached to UML. + + + If you are using something other than gdb as your debugger, then + you'll need to get it to do the equivalent of 'att 1' if it doesn't do + it automatically. + + + An example of an alternate debugger is strace. You can strace the + actual kernel as follows: + + o Run the following in a shell + + + host% + sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' + + + + o Run UML with 'debug' and 'gdb-pid=' with the pid printed out + by the previous command + + o Hit return in the shell, and UML will start running, and strace + output will start accumulating in the output file. + + Note that this is different from running + + + host% strace ./linux + + + + + That will strace only the main UML thread, the tracing thread, which + doesn't do any of the actual kernel work. It just oversees the vir- + tual machine. In contrast, using strace as described above will show + you the low-level activity of the virtual machine. + + + + + + 12. Kernel debugging examples + + 12.1. The case of the hung fsck + + When booting up the kernel, fsck failed, and dropped me into a shell + to fix things up. I ran fsck -y, which hung: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 was not cleanly unmounted, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Inode 19780, i_blocks is 1548, should be 540. Fix? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + + + + + The standard drill in this sort of situation is to fire up gdb on the + signal thread, which, in this case, was pid 1935. In another window, + I run gdb and attach pid 1935. + + + + + ~/linux/2.3.26/um 1016: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + + (gdb) att 1935 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 + 0x100756d9 in __wait4 () + + + + + + + Let's see what's currently running: + + + + (gdb) p current_task.pid + $1 = 0 + + + + + + It's the idle thread, which means that fsck went to sleep for some + reason and never woke up. + + + Let's guess that the last process in the process list is fsck: + + + + (gdb) p current_task.prev_task.comm + $13 = "fsck.ext2\000\000\000\000\000\000" + + + + + + It is, so let's see what it thinks it's up to: + + + + (gdb) p current_task.prev_task.thread + $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, + kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { + 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, + request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { + regs = {1350467584, 2952789424, 0 }, sigstack = 0, + pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, + arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { + op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} + + + + + + The interesting things here are the fact that its .thread.syscall.id + is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or + the defines in include/asm-um/arch/unistd.h), and that it never + returned. Also, its .request.op is OP_SWITCH (see + arch/um/include/user_util.h). These mean that it went into a write, + and, for some reason, called schedule(). + + + The fact that it never returned from write means that its stack should + be fairly interesting. Its pid is 1980 (.thread.extern_pid). That + process is being ptraced by the signal thread, so it must be detached + before gdb can attach it: + + + + + + + + + + + (gdb) call detach(1980) + + Program received signal SIGSEGV, Segmentation fault. + + The program being debugged stopped while in a function called from GDB. + When the function (detach) is done executing, GDB will silently + stop (instead of continuing to evaluate the expression containing + the function call). + (gdb) call detach(1980) + $15 = 0 + + + + + + The first detach segfaults for some reason, and the second one + succeeds. + + + Now I detach from the signal thread, attach to the fsck thread, and + look at its stack: + + + (gdb) det + Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 + (gdb) att 1980 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 + 0x10070451 in __kill () + (gdb) bt + #0 0x10070451 in __kill () + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + #4 0x10001d12 in schedule () at core.c:777 + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #9 + #10 0x10155404 in errno () + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #14 + #15 0xc0fd in ?? () + #16 0x10016647 in sys_write (fd=3, + buf=0x80b8800
, count=1024) + at read_write.c:159 + #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #19 + #20 0x400dc8b0 in ?? () + + + + + + The interesting things here are : + + o There are two segfaults on this stack (frames 9 and 14) + + o The first faulting address (frame 11) is 0x50000800 + + (gdb) p (void *)1342179328 + $16 = (void *) 0x50000800 + + + + + + The initial faulting address is interesting because it is on the idle + thread's stack. I had been seeing the idle thread segfault for no + apparent reason, and the cause looked like stack corruption. In hopes + of catching the culprit in the act, I had turned off all protections + to that stack while the idle thread wasn't running. This apparently + tripped that trap. + + + However, the more immediate problem is that second segfault and I'm + going to concentrate on that. First, I want to see where the fault + happened, so I have to go look at the sigcontent struct in frame 8: + + + + (gdb) up + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + 30 kill(pid, SIGUSR1); + (gdb) + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + 156 usr1_pid(getpid()); + (gdb) + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + 161 _switch_to(prev, next); + (gdb) + #4 0x10001d12 in schedule () at core.c:777 + 777 switch_to(prev, next, prev); + (gdb) + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + 71 schedule(); + (gdb) + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + 157 } + (gdb) + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + 182 segv_handler(sc); + (gdb) p *sc + Cannot access memory at address 0x0. + + + + + That's not very useful, so I'll try a more manual method: + + + (gdb) p *((struct sigcontext *) (&sig + 1)) + $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, + esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, + trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, + esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1280} + + + + The ip is in handle_mm_fault: + + + (gdb) p (void *)268480945 + $20 = (void *) 0x1000b1b1 + (gdb) i sym $20 + handle_mm_fault + 57 in section .text + + + + + + Specifically, it's in pte_alloc: + + + (gdb) i line *$20 + Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b1 + and ends at 0x1000b1b7 . + + + + + + To find where in handle_mm_fault this is, I'll jump forward in the + code until I see an address in that procedure: + + + + (gdb) i line *0x1000b1c0 + Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b7 + and ends at 0x1000b1c3 . + (gdb) i line *0x1000b1d0 + Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1d0 + and ends at 0x1000b1da . + (gdb) i line *0x1000b1e0 + Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1da + and ends at 0x1000b1e1 . + (gdb) i line *0x1000b1f0 + Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1f0 + and ends at 0x1000b200 . + (gdb) i line *0x1000b200 + Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b200 + and ends at 0x1000b208 . + (gdb) i line *0x1000b210 + Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b210 + and ends at 0x1000b219 . + (gdb) i line *0x1000b220 + Line 1168 of "memory.c" starts at address 0x1000b21e + and ends at 0x1000b222 . + + + + + + Something is apparently wrong with the page tables or vma_structs, so + lets go back to frame 11 and have a look at them: + + + + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + 50 handle_mm_fault(current, vma, address, is_write); + (gdb) call pgd_offset_proc(vma->vm_mm, address) + $22 = (pgd_t *) 0x80a548c + + + + + + That's pretty bogus. Page tables aren't supposed to be in process + text or data areas. Let's see what's in the vma: + + + (gdb) p *vma + $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, + vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, + vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, + vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, + vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, + vm_private_data = 0x62} + (gdb) p *vma.vm_mm + $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, + pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, + map_count = 134909076, mmap_sem = {count = {counter = 135073792}, + sleepers = -1342177872, wait = {lock = , + task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, + __magic = -1342177670, __creator = -1342177300}, __magic = 98}, + page_table_lock = {}, context = 138, start_code = 0, end_code = 0, + start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, + arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, + total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, + swap_address = 0, segments = 0x0} + + + + + + This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx + addresses, this is looking like a stack was plonked down on top of + these structures. Maybe it's a stack overflow from the next page: + + + + (gdb) p vma + $25 = (struct vm_area_struct *) 0x507d2434 + + + + + + That's towards the lower quarter of the page, so that would have to + have been pretty heavy stack overflow: + + + + + + + + + + + + + + + (gdb) x/100x $25 + 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c + 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 + 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a + 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 + 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 + 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 + 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 + + + + + + It's not stack overflow. The only "stack-like" piece of this data is + the vma_struct itself. + + + At this point, I don't see any avenues to pursue, so I just have to + admit that I have no idea what's going on. What I will do, though, is + stick a trap on the segfault handler which will stop if it sees any + writes to the idle thread's stack. That was the thing that happened + first, and it may be that if I can catch it immediately, what's going + on will be somewhat clearer. + + + 12.2. Episode 2: The case of the hung fsck + + After setting a trap in the SEGV handler for accesses to the signal + thread's stack, I reran the kernel. + + + fsck hung again, this time by hitting the trap: + + + + + + + + + + + + + + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 contains a file system with errors, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + Untested (4127) [100fe44c]: trap_kern.c line 31 + + + + + + I need to get the signal thread to detach from pid 4127 so that I can + attach to it with gdb. This is done by sending it a SIGUSR1, which is + caught by the signal thread, which detaches the process: + + + kill -USR1 4127 + + + + + + Now I can run gdb on it: + + + + + + + + + + + + + + ~/linux/2.3.26/um 1034: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) att 4127 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 + 0x10075891 in __libc_nanosleep () + + + + + + The backtrace shows that it was in a write and that the fault address + (address in frame 3) is 0x50000800, which is right in the middle of + the signal thread's stack page: + + + (gdb) bt + #0 0x10075891 in __libc_nanosleep () + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + #2 0x1006ce9a in stop () at user_util.c:191 + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 + #6 + #7 0xc0fd in ?? () + #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) + at read_write.c:159 + #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #11 + #12 0x400dc8b0 in ?? () + #13 + #14 0x400dc8b0 in ?? () + #15 0x80545fd in ?? () + #16 0x804daae in ?? () + #17 0x8054334 in ?? () + #18 0x804d23e in ?? () + #19 0x8049632 in ?? () + #20 0x80491d2 in ?? () + #21 0x80596b5 in ?? () + (gdb) p (void *)1342179328 + $3 = (void *) 0x50000800 + + + + + + Going up the stack to the segv_handler frame and looking at where in + the code the access happened shows that it happened near line 110 of + block_dev.c: + + + + + + + + + + (gdb) up + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. + (gdb) + #2 0x1006ce9a in stop () at user_util.c:191 + 191 while(1) sleep(1000000); + (gdb) + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + 31 KERN_UNTESTED(); + (gdb) + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) p *sc + $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, + esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, + err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, + esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1342179328} + (gdb) p (void *)268550834 + $2 = (void *) 0x1001c2b2 + (gdb) i sym $2 + block_write + 1090 in section .text + (gdb) i line *$2 + Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" + starts at address 0x1001c2a1 + and ends at 0x1001c2bf . + (gdb) i line *0x1001c2c0 + Line 110 of "block_dev.c" starts at address 0x1001c2bf + and ends at 0x1001c2e3 . + + + + + + Looking at the source shows that the fault happened during a call to + copy_from_user to copy the data into the kernel: + + + 107 count -= chars; + 108 copy_from_user(p,buf,chars); + 109 p += chars; + 110 buf += chars; + + + + + + p is the pointer which must contain 0x50000800, since buf contains + 0x80b8800 (frame 8 above). It is defined as: + + + p = offset + bh->b_data; + + + + + + I need to figure out what bh is, and it just so happens that bh is + passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a + few lines later, so I do a little disassembly: + + + + + (gdb) disas 0x1001c2bf 0x1001c2e0 + Dump of assembler code from 0x1001c2bf to 0x1001c2d0: + 0x1001c2bf : addl %eax,0xc(%ebp) + 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx + 0x1001c2c8 : btsl $0x0,0x18(%edx) + 0x1001c2cd : btsl $0x1,0x18(%edx) + 0x1001c2d2 : sbbl %ecx,%ecx + 0x1001c2d4 : testl %ecx,%ecx + 0x1001c2d6 : jne 0x1001c2e3 + 0x1001c2d8 : pushl $0x0 + 0x1001c2da : pushl %edx + 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> + End of assembler dump. + + + + + + At that point, bh is in %edx (address 0x1001c2da), which is calculated + at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, + taking %ebp from the sigcontext_struct above: + + + (gdb) p (void *)1342631484 + $5 = (void *) 0x5006ee3c + (gdb) p 0x5006ee3c+0xfffffdd4 + $6 = 1342630928 + (gdb) p (void *)$6 + $7 = (void *) 0x5006ec10 + (gdb) p *((void **)$7) + $8 = (void *) 0x50100200 + + + + + + Now, I look at the structure to see what's in it, and particularly, + what its b_data field contains: + + + (gdb) p *((struct buffer_head *)0x50100200) + $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, + b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, + b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, + b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, + b_data = 0x50000800 "", b_page = 0x50004000, + b_end_io = 0x10017f60 , b_dev_id = 0x0, + b_rsector = 98810, b_wait = {lock = , + task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, + __creator = 0}, b_kiobuf = 0x0} + + + + + + The b_data field is indeed 0x50000800, so the question becomes how + that happened. The rest of the structure looks fine, so this probably + is not a case of data corruption. It happened on purpose somehow. + + + The b_page field is a pointer to the page_struct representing the + 0x50000000 page. Looking at it shows the kernel's idea of the state + of that page: + + + + (gdb) p *$13.b_page + $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, + index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { + next = 0x50008460, prev = 0x50019350}, wait = { + lock = , task_list = {next = 0x50004024, + prev = 0x50004024}, __magic = 1342193708, __creator = 0}, + pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, + zone = 0x100c5160} + + + + + + Some sanity-checking: the virtual field shows the "virtual" address of + this page, which in this kernel is the same as its "physical" address, + and the page_struct itself should be mem_map[0], since it represents + the first page of memory: + + + + (gdb) p (void *)1342177280 + $18 = (void *) 0x50000000 + (gdb) p mem_map + $19 = (mem_map_t *) 0x50004000 + + + + + + These check out fine. + + + Now to check out the page_struct itself. In particular, the flags + field shows whether the page is considered free or not: + + + (gdb) p (void *)132 + $21 = (void *) 0x84 + + + + + + The "reserved" bit is the high bit, which is definitely not set, so + the kernel considers the signal stack page to be free and available to + be used. + + + At this point, I jump to conclusions and start looking at my early + boot code, because that's where that page is supposed to be reserved. + + + In my setup_arch procedure, I have the following code which looks just + fine: + + + + bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); + free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); + + + + + + Two stack pages have already been allocated, and low_physmem points to + the third page, which is the beginning of free memory. + The init_bootmem call declares the entire memory to the boot memory + manager, which marks it all reserved. The free_bootmem call frees up + all of it, except for the first two pages. This looks correct to me. + + + So, I decide to see init_bootmem run and make sure that it is marking + those first two pages as reserved. I never get that far. + + + Stepping into init_bootmem, and looking at bootmem_map before looking + at what it contains shows the following: + + + + (gdb) p bootmem_map + $3 = (void *) 0x50000000 + + + + + + Aha! The light dawns. That first page is doing double duty as a + stack and as the boot memory map. The last thing that the boot memory + manager does is to free the pages used by its memory map, so this page + is getting freed even its marked as reserved. + + + The fix was to initialize the boot memory manager before allocating + those two stack pages, and then allocate them through the boot memory + manager. After doing this, and fixing a couple of subsequent buglets, + the stack corruption problem disappeared. + + + + + + 13. What to do when UML doesn't work + + + + + 13.1. Strange compilation errors when you build from source + + As of test11, it is necessary to have "ARCH=um" in the environment or + on the make command line for all steps in building UML, including + clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, + and linux. If you forget for any of them, the i386 build seems to + contaminate the UML build. If this happens, start from scratch with + + + host% + make mrproper ARCH=um + + + + + and repeat the build process with ARCH=um on all the steps. + + + See ``Compiling the kernel and modules'' for more details. + + + Another cause of strange compilation errors is building UML in + /usr/src/linux. If you do this, the first thing you need to do is + clean up the mess you made. The /usr/src/linux/asm link will now + point to /usr/src/linux/asm-um. Make it point back to + /usr/src/linux/asm-i386. Then, move your UML pool someplace else and + build it there. Also see below, where a more specific set of symptoms + is described. + + + + 13.3. A variety of panics and hangs with /tmp on a reiserfs filesys- + tem + + I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. + Panics preceded by + + + Detaching pid nnnn + + + + are diagnostic of this problem. This is a reiserfs bug which causes a + thread to occasionally read stale data from a mmapped page shared with + another thread. The fix is to upgrade the filesystem or to have /tmp + be an ext2 filesystem. + + + + 13.4. The compile fails with errors about conflicting types for + 'open', 'dup', and 'waitpid' + + This happens when you build in /usr/src/linux. The UML build makes + the include/asm link point to include/asm-um. /usr/include/asm points + to /usr/src/linux/include/asm, so when that link gets moved, files + which need to include the asm-i386 versions of headers get the + incompatible asm-um versions. The fix is to move the include/asm link + back to include/asm-i386 and to do UML builds someplace else. + + + + 13.5. UML doesn't work when /tmp is an NFS filesystem + + This seems to be a similar situation with the ReiserFS problem above. + Some versions of NFS seems not to handle mmap correctly, which UML + depends on. The workaround is have /tmp be a non-NFS directory. + + + 13.6. UML hangs on boot when compiled with gprof support + + If you build UML with gprof support and, early in the boot, it does + this + + + kernel BUG at page_alloc.c:100! + + + + + you have a buggy gcc. You can work around the problem by removing + UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up + another bug, but that one is fairly hard to reproduce. + + + + 13.7. syslogd dies with a SIGTERM on startup + + The exact boot error depends on the distribution that you're booting, + but Debian produces this: + + + /etc/rc2.d/S10sysklogd: line 49: 93 Terminated + start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD + + + + + This is a syslogd bug. There's a race between a parent process + installing a signal handler and its child sending the signal. See + this uml-devel post for the details. + + + + 13.8. TUN/TAP networking doesn't work on a 2.4 host + + There are a couple of problems which were + name="pointed + out"> by Tim Robinson + + o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. + The fix is to upgrade to something more recent and then read the + next item. + + o If you see + + + File descriptor in bad state + + + + when you bring up the device inside UML, you have a header mismatch + between the original kernel and the upgraded one. Make /usr/src/linux + point at the new headers. This will only be a problem if you build + uml_net yourself. + + + + 13.9. You can network to the host but not to other machines on the + net + + If you can connect to the host, and the host can connect to UML, but + you cannot connect to any other machines, then you may need to enable + IP Masquerading on the host. Usually this is only experienced when + using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML + networking, rather than the public address space that your host is + connected to. UML does not enable IP Masquerading, so you will need + to create a static rule to enable it: + + + host% + iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE + + + + + Replace eth0 with the interface that you use to talk to the rest of + the world. + + + Documentation on IP Masquerading, and SNAT, can be found at + www.netfilter.org . + + + If you can reach the local net, but not the outside Internet, then + that is usually a routing problem. The UML needs a default route: + + + UML# + route add default gw gateway IP + + + + + The gateway IP can be any machine on the local net that knows how to + reach the outside world. Usually, this is the host or the local net- + work's gateway. + + + Occasionally, we hear from someone who can reach some machines, but + not others on the same net, or who can reach some ports on other + machines, but not others. These are usually caused by strange + firewalling somewhere between the UML and the other box. You track + this down by running tcpdump on every interface the packets travel + over and see where they disappear. When you find a machine that takes + the packets in, but does not send them onward, that's the culprit. + + + + 13.10. I have no root and I want to scream + + Thanks to Birgit Wahlich for telling me about this strange one. It + turns out that there's a limit of six environment variables on the + kernel command line. When that limit is reached or exceeded, argument + processing stops, which means that the 'root=' argument that UML + usually adds is not seen. So, the filesystem has no idea what the + root device is, so it panics. + + + The fix is to put less stuff on the command line. Glomming all your + setup variables into one is probably the best way to go. + + + + 13.11. UML build conflict between ptrace.h and ucontext.h + + On some older systems, /usr/include/asm/ptrace.h and + /usr/include/sys/ucontext.h define the same names. So, when they're + included together, the defines from one completely mess up the parsing + of the other, producing errors like: + /usr/include/sys/ucontext.h:47: parse error before + `10' + + + + + plus a pile of warnings. + + + This is a libc botch, which has since been fixed, and I don't see any + way around it besides upgrading. + + + + 13.12. The UML BogoMips is exactly half the host's BogoMips + + On i386 kernels, there are two ways of running the loop that is used + to calculate the BogoMips rating, using the TSC if it's there or using + a one-instruction loop. The TSC produces twice the BogoMips as the + loop. UML uses the loop, since it has nothing resembling a TSC, and + will get almost exactly the same BogoMips as a host using the loop. + However, on a host with a TSC, its BogoMips will be double the loop + BogoMips, and therefore double the UML BogoMips. + + + + 13.13. When you run UML, it immediately segfaults + + If the host is configured with the 2G/2G address space split, that's + why. See ``UML on 2G/2G hosts'' for the details on getting UML to + run on your host. + + + + 13.14. xterms appear, then immediately disappear + + If you're running an up to date kernel with an old release of + uml_utilities, the port-helper program will not work properly, so + xterms will exit straight after they appear. The solution is to + upgrade to the latest release of uml_utilities. Usually this problem + occurs when you have installed a packaged release of UML then compiled + your own development kernel without upgrading the uml_utilities from + the source distribution. + + + + 13.15. Any other panic, hang, or strange behavior + + If you're seeing truly strange behavior, such as hangs or panics that + happen in random places, or you try running the debugger to see what's + happening and it acts strangely, then it could be a problem in the + host kernel. If you're not running a stock Linus or -ac kernel, then + try that. An early version of the preemption patch and a 2.4.10 SuSE + kernel have caused very strange problems in UML. + + + Otherwise, let me know about it. Send a message to one of the UML + mailing lists - either the developer list - user-mode-linux-devel at + lists dot sourceforge dot net (subscription info) or the user list - + user-mode-linux-user at lists dot sourceforge do net (subscription + info), whichever you prefer. Don't assume that everyone knows about + it and that a fix is imminent. + + + If you want to be super-helpful, read ``Diagnosing Problems'' and + follow the instructions contained therein. + 14. Diagnosing Problems + + + If you get UML to crash, hang, or otherwise misbehave, you should + report this on one of the project mailing lists, either the developer + list - user-mode-linux-devel at lists dot sourceforge dot net + (subscription info) or the user list - user-mode-linux-user at lists + dot sourceforge dot net (subscription info). When you do, it is + likely that I will want more information. So, it would be helpful to + read the stuff below, do whatever is applicable in your case, and + report the results to the list. + + + For any diagnosis, you're going to need to build a debugging kernel. + The binaries from this site aren't debuggable. If you haven't done + this before, read about ``Compiling the kernel and modules'' and + ``Kernel debugging'' UML first. + + + 14.1. Case 1 : Normal kernel panics + + The most common case is for a normal thread to panic. To debug this, + you will need to run it under the debugger (add 'debug' to the command + line). An xterm will start up with gdb running inside it. Continue + it when it stops in start_kernel and make it crash. Now ^C gdb and + + + If the panic was a "Kernel mode fault", then there will be a segv + frame on the stack and I'm going to want some more information. The + stack might look something like this: + + + (UML gdb) backtrace + #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) + at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 + #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 + #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 + #3 0x1009bf38 in __restore () + at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 + #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) + at trap_kern.c:66 + #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 + #6 0x1009bf38 in __restore () + + + + + I'm going to want to see the symbol and line information for the value + of ip in the segv frame. In this case, you would do the following: + + + (UML gdb) i sym 268849158 + + + + + and + + + (UML gdb) i line *268849158 + + + + + The reason for this is the __restore frame right above the segv_han- + dler frame is hiding the frame that actually segfaulted. So, I have + to get that information from the faulting ip. + + + 14.2. Case 2 : Tracing thread panics + + The less common and more painful case is when the tracing thread + panics. In this case, the kernel debugger will be useless because it + needs a healthy tracing thread in order to work. The first thing to + do is get a backtrace from the tracing thread. This is done by + figuring out what its pid is, firing up gdb, and attaching it to that + pid. You can figure out the tracing thread pid by looking at the + first line of the console output, which will look like this: + + + tracing thread pid = 15851 + + + + + or by running ps on the host and finding the line that looks like + this: + + + jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] + + + + + If the panic was 'segfault in signals', then follow the instructions + above for collecting information about the location of the seg fault. + + + If the tracing thread flaked out all by itself, then send that + backtrace in and wait for our crack debugging team to fix the problem. + + + 14.3. Case 3 : Tracing thread panics caused by other threads + + However, there are cases where the misbehavior of another thread + caused the problem. The most common panic of this type is: + + + wait_for_stop failed to wait for to stop with + + + + + In this case, you'll need to get a backtrace from the process men- + tioned in the panic, which is complicated by the fact that the kernel + debugger is defunct and without some fancy footwork, another gdb can't + attach to it. So, this is how the fancy footwork goes: + + In a shell: + + + host% kill -STOP pid + + + + + Run gdb on the tracing thread as described in case 2 and do: + + + (host gdb) call detach(pid) + + + If you get a segfault, do it again. It always works the second time. + + Detach from the tracing thread and attach to that other thread: + + + (host gdb) detach + + + + + + + (host gdb) attach pid + + + + + If gdb hangs when attaching to that process, go back to a shell and + do: + + + host% + kill -CONT pid + + + + + And then get the backtrace: + + + (host gdb) backtrace + + + + + + 14.4. Case 4 : Hangs + + Hangs seem to be fairly rare, but they sometimes happen. When a hang + happens, we need a backtrace from the offending process. Run the + kernel debugger as described in case 1 and get a backtrace. If the + current process is not the idle thread, then send in the backtrace. + You can tell that it's the idle thread if the stack looks like this: + + + #0 0x100b1401 in __libc_nanosleep () + #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 + #2 0x100a546f in do_idle () at process_kern.c:445 + #3 0x100a5508 in cpu_idle () at process_kern.c:471 + #4 0x100ec18f in start_kernel () at init/main.c:592 + #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 + #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 + + + + + If this is the case, then some other process is at fault, and went to + sleep when it shouldn't have. Run ps on the host and figure out which + process should not have gone to sleep and stayed asleep. Then attach + to it with gdb and get a backtrace as described in case 3. + + + + + + + 15. Thanks + + + A number of people have helped this project in various ways, and this + page gives recognition where recognition is due. + + + If you're listed here and you would prefer a real link on your name, + or no link at all, instead of the despammed email address pseudo-link, + let me know. + + + If you're not listed here and you think maybe you should be, please + let me know that as well. I try to get everyone, but sometimes my + bookkeeping lapses and I forget about contributions. + + + 15.1. Code and Documentation + + Rusty Russell - + + o wrote the HOWTO + + o prodded me into making this project official and putting it on + SourceForge + + o came up with the way cool UML logo + + o redid the config process + + + Peter Moulder - Fixed my config and build + processes, and added some useful code to the block driver + + + Bill Stearns - + + o HOWTO updates + + o lots of bug reports + + o lots of testing + + o dedicated a box (uml.ists.dartmouth.edu) to support UML development + + o wrote the mkrootfs script, which allows bootable filesystems of + RPM-based distributions to be cranked out + + o cranked out a large number of filesystems with said script + + + Jim Leu - Wrote the virtual ethernet driver + and associated usermode tools + + Lars Brinkhoff - Contributed the ptrace + proxy from his own project to allow easier + kernel debugging + + + Andrea Arcangeli - Redid some of the early boot + code so that it would work on machines with Large File Support + + + Chris Emerson - Did + the first UML port to Linux/ppc + + + Harald Welte - Wrote the multicast + transport for the network driver + + + Jorgen Cederlof - Added special file support to hostfs + + + Greg Lonnon - Changed the ubd driver + to allow it to layer a COW file on a shared read-only filesystem and + wrote the iomem emulation support + + + Henrik Nordstrom - Provided a variety + of patches, fixes, and clues + + + Lennert Buytenhek - Contributed various patches, a rewrite of the + network driver, the first implementation of the mconsole driver, and + did the bulk of the work needed to get SMP working again. + + + Yon Uriarte - Fixed the TUN/TAP network backend while I slept. + + + Adam Heath - Made a bunch of nice cleanups to the initialization code, + plus various other small patches. + + + Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and + is doing a real nice job of it. He also noticed and fixed a number of + actually and potentially exploitable security holes in uml_net. Plus + the occasional patch. I like patches. + + + James McMechan - James seems to have taken over maintenance of the ubd + driver and is doing a nice job of it. + + + Chandan Kudige - wrote the umlgdb script which automates the reloading + of module symbols. + + + Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, + enabling UML processes to access audio devices on the host. He also + submitted patches for the slip transport and lots of other things. + + + David Coulson - + + o Set up the usermodelinux.org site, + which is a great way of keeping the UML user community on top of + UML goings-on. + + o Site documentation and updates + + o Nifty little UML management daemon UMLd + + + o Lots of testing and bug reports + + + + + 15.2. Flushing out bugs + + + + o Yuri Pudgorodsky + + o Gerald Britton + + o Ian Wehrman + + o Gord Lamb + + o Eugene Koontz + + o John H. Hartman + + o Anders Karlsson + + o Daniel Phillips + + o John Fremlin + + o Rainer Burgstaller + + o James Stevenson + + o Matt Clay + + o Cliff Jefferies + + o Geoff Hoff + + o Lennert Buytenhek + + o Al Viro + + o Frank Klingenhoefer + + o Livio Baldini Soares + + o Jon Burgess + + o Petru Paler + + o Paul + + o Chris Reahard + + o Sverker Nilsson + + o Gong Su + + o johan verrept + + o Bjorn Eriksson + + o Lorenzo Allegrucci + + o Muli Ben-Yehuda + + o David Mansfield + + o Howard Goff + + o Mike Anderson + + o John Byrne + + o Sapan J. Batia + + o Iris Huang + + o Jan Hudec + + o Voluspa + + + + + 15.3. Buglets and clean-ups + + + + o Dave Zarzycki + + o Adam Lazur + + o Boria Feigin + + o Brian J. Murrell + + o JS + + o Roman Zippel + + o Wil Cooley + + o Ayelet Shemesh + + o Will Dyson + + o Sverker Nilsson + + o dvorak + + o v.naga srinivas + + o Shlomi Fish + + o Roger Binns + + o johan verrept + + o MrChuoi + + o Peter Cleve + + o Vincent Guffens + + o Nathan Scott + + o Patrick Caulfield + + o jbearce + + o Catalin Marinas + + o Shane Spencer + + o Zou Min + + + o Ryan Boder + + o Lorenzo Colitti + + o Gwendal Grignou + + o Andre' Breiler + + o Tsutomu Yasuda + + + + 15.4. Case Studies + + + o Jon Wright + + o William McEwan + + o Michael Richardson + + + + 15.5. Other contributions + + + Bill Carr made the Red Hat mkrootfs script + work with RH 6.2. + + Michael Jennings sent in some material which + is now gracing the top of the index page of this site. + + SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com + . The bandwidth there made it possible to + produce most of the filesystems available on the project download + page. + + Laurent Bonnaud took the old grotty + Debian filesystem that I've been distributing and updated it to 2.2. + It is now available by itself here. + + Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make + releases even when Sourceforge is broken. + + Rodrigo de Castro looked at my broken pte code and told me what was + wrong with it, letting me fix a long-standing (several weeks) and + serious set of bugs. + + Chris Reahard built a specialized root filesystem for running a DNS + server jailed inside UML. It's available from the download + page in the Jail + Filesystems section. + + + + + + + + + + + + diff --git a/Documentation/virtual/index.rst b/Documentation/virtual/index.rst deleted file mode 100644 index 062ffb527043..000000000000 --- a/Documentation/virtual/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============================ -Linux Virtualization Support -============================ - -.. toctree:: - :maxdepth: 2 - - kvm/index - paravirt_ops - -.. only:: html and subproject - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst deleted file mode 100644 index d18c97b4e140..000000000000 --- a/Documentation/virtual/kvm/amd-memory-encryption.rst +++ /dev/null @@ -1,250 +0,0 @@ -====================================== -Secure Encrypted Virtualization (SEV) -====================================== - -Overview -======== - -Secure Encrypted Virtualization (SEV) is a feature found on AMD processors. - -SEV is an extension to the AMD-V architecture which supports running -virtual machines (VMs) under the control of a hypervisor. When enabled, -the memory contents of a VM will be transparently encrypted with a key -unique to that VM. - -The hypervisor can determine the SEV support through the CPUID -instruction. The CPUID function 0x8000001f reports information related -to SEV:: - - 0x8000001f[eax]: - Bit[1] indicates support for SEV - ... - [ecx]: - Bits[31:0] Number of encrypted guests supported simultaneously - -If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 -(MSR_K7_HWCR) can be used to determine if it can be enabled:: - - 0xc001_0010: - Bit[23] 1 = memory encryption can be enabled - 0 = memory encryption can not be enabled - - 0xc001_0015: - Bit[0] 1 = memory encryption can be enabled - 0 = memory encryption can not be enabled - -When SEV support is available, it can be enabled in a specific VM by -setting the SEV bit before executing VMRUN.:: - - VMCB[0x90]: - Bit[1] 1 = SEV is enabled - 0 = SEV is disabled - -SEV hardware uses ASIDs to associate a memory encryption key with a VM. -Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value -defined in the CPUID 0x8000001f[ecx] field. - -SEV Key Management -================== - -The SEV guest key management is handled by a separate processor called the AMD -Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure -key management interface to perform common hypervisor activities such as -encrypting bootstrap code, snapshot, migrating and debugging the guest. For more -information, see the SEV Key Management spec [api-spec]_ - -KVM implements the following commands to support common lifecycle events of SEV -guests, such as launching, running, snapshotting, migrating and decommissioning. - -1. KVM_SEV_INIT ---------------- - -The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform -context. In a typical workflow, this command should be the first command issued. - -Returns: 0 on success, -negative on error - -2. KVM_SEV_LAUNCH_START ------------------------ - -The KVM_SEV_LAUNCH_START command is used for creating the memory encryption -context. To create the encryption context, user must provide a guest policy, -the owner's public Diffie-Hellman (PDH) key and session information. - -Parameters: struct kvm_sev_launch_start (in/out) - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_launch_start { - __u32 handle; /* if zero then firmware creates a new handle */ - __u32 policy; /* guest's policy */ - - __u64 dh_uaddr; /* userspace address pointing to the guest owner's PDH key */ - __u32 dh_len; - - __u64 session_addr; /* userspace address which points to the guest session information */ - __u32 session_len; - }; - -On success, the 'handle' field contains a new handle and on error, a negative value. - -For more details, see SEV spec Section 6.2. - -3. KVM_SEV_LAUNCH_UPDATE_DATA ------------------------------ - -The KVM_SEV_LAUNCH_UPDATE_DATA is used for encrypting a memory region. It also -calculates a measurement of the memory contents. The measurement is a signature -of the memory contents that can be sent to the guest owner as an attestation -that the memory was encrypted correctly by the firmware. - -Parameters (in): struct kvm_sev_launch_update_data - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_launch_update { - __u64 uaddr; /* userspace address to be encrypted (must be 16-byte aligned) */ - __u32 len; /* length of the data to be encrypted (must be 16-byte aligned) */ - }; - -For more details, see SEV spec Section 6.3. - -4. KVM_SEV_LAUNCH_MEASURE -------------------------- - -The KVM_SEV_LAUNCH_MEASURE command is used to retrieve the measurement of the -data encrypted by the KVM_SEV_LAUNCH_UPDATE_DATA command. The guest owner may -wait to provide the guest with confidential information until it can verify the -measurement. Since the guest owner knows the initial contents of the guest at -boot, the measurement can be verified by comparing it to what the guest owner -expects. - -Parameters (in): struct kvm_sev_launch_measure - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_launch_measure { - __u64 uaddr; /* where to copy the measurement */ - __u32 len; /* length of measurement blob */ - }; - -For more details on the measurement verification flow, see SEV spec Section 6.4. - -5. KVM_SEV_LAUNCH_FINISH ------------------------- - -After completion of the launch flow, the KVM_SEV_LAUNCH_FINISH command can be -issued to make the guest ready for the execution. - -Returns: 0 on success, -negative on error - -6. KVM_SEV_GUEST_STATUS ------------------------ - -The KVM_SEV_GUEST_STATUS command is used to retrieve status information about a -SEV-enabled guest. - -Parameters (out): struct kvm_sev_guest_status - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_guest_status { - __u32 handle; /* guest handle */ - __u32 policy; /* guest policy */ - __u8 state; /* guest state (see enum below) */ - }; - -SEV guest state: - -:: - - enum { - SEV_STATE_INVALID = 0; - SEV_STATE_LAUNCHING, /* guest is currently being launched */ - SEV_STATE_SECRET, /* guest is being launched and ready to accept the ciphertext data */ - SEV_STATE_RUNNING, /* guest is fully launched and running */ - SEV_STATE_RECEIVING, /* guest is being migrated in from another SEV machine */ - SEV_STATE_SENDING /* guest is getting migrated out to another SEV machine */ - }; - -7. KVM_SEV_DBG_DECRYPT ----------------------- - -The KVM_SEV_DEBUG_DECRYPT command can be used by the hypervisor to request the -firmware to decrypt the data at the given memory region. - -Parameters (in): struct kvm_sev_dbg - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_dbg { - __u64 src_uaddr; /* userspace address of data to decrypt */ - __u64 dst_uaddr; /* userspace address of destination */ - __u32 len; /* length of memory region to decrypt */ - }; - -The command returns an error if the guest policy does not allow debugging. - -8. KVM_SEV_DBG_ENCRYPT ----------------------- - -The KVM_SEV_DEBUG_ENCRYPT command can be used by the hypervisor to request the -firmware to encrypt the data at the given memory region. - -Parameters (in): struct kvm_sev_dbg - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_dbg { - __u64 src_uaddr; /* userspace address of data to encrypt */ - __u64 dst_uaddr; /* userspace address of destination */ - __u32 len; /* length of memory region to encrypt */ - }; - -The command returns an error if the guest policy does not allow debugging. - -9. KVM_SEV_LAUNCH_SECRET ------------------------- - -The KVM_SEV_LAUNCH_SECRET command can be used by the hypervisor to inject secret -data after the measurement has been validated by the guest owner. - -Parameters (in): struct kvm_sev_launch_secret - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_launch_secret { - __u64 hdr_uaddr; /* userspace address containing the packet header */ - __u32 hdr_len; - - __u64 guest_uaddr; /* the guest memory region where the secret should be injected */ - __u32 guest_len; - - __u64 trans_uaddr; /* the hypervisor memory region which contains the secret */ - __u32 trans_len; - }; - -References -========== - - -See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info. - -.. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf -.. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf -.. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34) -.. [kvm-forum] http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt deleted file mode 100644 index e54a3f51ddc5..000000000000 --- a/Documentation/virtual/kvm/api.txt +++ /dev/null @@ -1,5296 +0,0 @@ -The Definitive KVM (Kernel-based Virtual Machine) API Documentation -=================================================================== - -1. General description ----------------------- - -The kvm API is a set of ioctls that are issued to control various aspects -of a virtual machine. The ioctls belong to three classes: - - - System ioctls: These query and set global attributes which affect the - whole kvm subsystem. In addition a system ioctl is used to create - virtual machines. - - - VM ioctls: These query and set attributes that affect an entire virtual - machine, for example memory layout. In addition a VM ioctl is used to - create virtual cpus (vcpus) and devices. - - VM ioctls must be issued from the same process (address space) that was - used to create the VM. - - - vcpu ioctls: These query and set attributes that control the operation - of a single virtual cpu. - - vcpu ioctls should be issued from the same thread that was used to create - the vcpu, except for asynchronous vcpu ioctl that are marked as such in - the documentation. Otherwise, the first ioctl after switching threads - could see a performance impact. - - - device ioctls: These query and set attributes that control the operation - of a single device. - - device ioctls must be issued from the same process (address space) that - was used to create the VM. - -2. File descriptors -------------------- - -The kvm API is centered around file descriptors. An initial -open("/dev/kvm") obtains a handle to the kvm subsystem; this handle -can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this -handle will create a VM file descriptor which can be used to issue VM -ioctls. A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will -create a virtual cpu or device and return a file descriptor pointing to -the new resource. Finally, ioctls on a vcpu or device fd can be used -to control the vcpu or device. For vcpus, this includes the important -task of actually running guest code. - -In general file descriptors can be migrated among processes by means -of fork() and the SCM_RIGHTS facility of unix domain socket. These -kinds of tricks are explicitly not supported by kvm. While they will -not cause harm to the host, their actual behavior is not guaranteed by -the API. See "General description" for details on the ioctl usage -model that is supported by KVM. - -It is important to note that althought VM ioctls may only be issued from -the process that created the VM, a VM's lifecycle is associated with its -file descriptor, not its creator (process). In other words, the VM and -its resources, *including the associated address space*, are not freed -until the last reference to the VM's file descriptor has been released. -For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will -not be freed until both the parent (original) process and its child have -put their references to the VM's file descriptor. - -Because a VM's resources are not freed until the last reference to its -file descriptor is released, creating additional references to a VM via -via fork(), dup(), etc... without careful consideration is strongly -discouraged and may have unwanted side effects, e.g. memory allocated -by and on behalf of the VM's process may not be freed/unaccounted when -the VM is shut down. - - -3. Extensions -------------- - -As of Linux 2.6.22, the KVM ABI has been stabilized: no backward -incompatible change are allowed. However, there is an extension -facility that allows backward-compatible extensions to the API to be -queried and used. - -The extension mechanism is not based on the Linux version number. -Instead, kvm defines extension identifiers and a facility to query -whether a particular extension identifier is available. If it is, a -set of ioctls is available for application use. - - -4. API description ------------------- - -This section describes ioctls that can be used to control kvm guests. -For each ioctl, the following information is provided along with a -description: - - Capability: which KVM extension provides this ioctl. Can be 'basic', - which means that is will be provided by any kernel that supports - API version 12 (see section 4.1), a KVM_CAP_xyz constant, which - means availability needs to be checked with KVM_CHECK_EXTENSION - (see section 4.4), or 'none' which means that while not all kernels - support this ioctl, there's no capability bit to check its - availability: for kernels that don't support the ioctl, - the ioctl returns -ENOTTY. - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Type: system, vm, or vcpu. - - Parameters: what parameters are accepted by the ioctl. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -4.1 KVM_GET_API_VERSION - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: the constant KVM_API_VERSION (=12) - -This identifies the API version as the stable kvm API. It is not -expected that this number will change. However, Linux 2.6.20 and -2.6.21 report earlier versions; these are not documented and not -supported. Applications should refuse to run if KVM_GET_API_VERSION -returns a value other than 12. If this check passes, all ioctls -described as 'basic' will be available. - - -4.2 KVM_CREATE_VM - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: machine type identifier (KVM_VM_*) -Returns: a VM fd that can be used to control the new virtual machine. - -The new VM has no virtual cpus and no memory. -You probably want to use 0 as machine type. - -In order to create user controlled virtual machines on S390, check -KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as -privileged user (CAP_SYS_ADMIN). - -To use hardware assisted virtualization on MIPS (VZ ASE) rather than -the default trap & emulate implementation (which changes the virtual -memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the -flag KVM_VM_MIPS_VZ. - - -On arm64, the physical address size for a VM (IPA Size limit) is limited -to 40bits by default. The limit can be configured if the host supports the -extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use -KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type -identifier, where IPA_Bits is the maximum width of any physical -address used by the VM. The IPA_Bits is encoded in bits[7-0] of the -machine type identifier. - -e.g, to configure a guest to use 48bit physical address size : - - vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); - -The requested size (IPA_Bits) must be : - 0 - Implies default size, 40bits (for backward compatibility) - - or - - N - Implies N bits, where N is a positive integer such that, - 32 <= N <= Host_IPA_Limit - -Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and -is dependent on the CPU capability and the kernel configuration. The limit can -be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION -ioctl() at run-time. - -Please note that configuring the IPA size does not affect the capability -exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects -size of the address translated by the stage2 level (guest physical to -host physical address translations). - - -4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST - -Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_msr_list (in/out) -Returns: 0 on success; -1 on error -Errors: - EFAULT: the msr index list cannot be read from or written to - E2BIG: the msr index list is to be to fit in the array specified by - the user. - -struct kvm_msr_list { - __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; -}; - -The user fills in the size of the indices array in nmsrs, and in return -kvm adjusts nmsrs to reflect the actual number of msrs and fills in the -indices array with their numbers. - -KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list -varies by kvm version and host processor, but does not change otherwise. - -Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are -not returned in the MSR list, as different vcpus can have a different number -of banks, as set via the KVM_X86_SETUP_MCE ioctl. - -KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed -to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities -and processor features that are exposed via MSRs (e.g., VMX capabilities). -This list also varies by kvm version and host processor, but does not change -otherwise. - - -4.4 KVM_CHECK_EXTENSION - -Capability: basic, KVM_CAP_CHECK_EXTENSION_VM for vm ioctl -Architectures: all -Type: system ioctl, vm ioctl -Parameters: extension identifier (KVM_CAP_*) -Returns: 0 if unsupported; 1 (or some other positive integer) if supported - -The API allows the application to query about extensions to the core -kvm API. Userspace passes an extension identifier (an integer) and -receives an integer that describes the extension availability. -Generally 0 means no and 1 means yes, but some extensions may report -additional information in the integer return value. - -Based on their initialization different VMs may have different capabilities. -It is thus encouraged to use the vm ioctl to query for capabilities (available -with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) - -4.5 KVM_GET_VCPU_MMAP_SIZE - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: size of vcpu mmap area, in bytes - -The KVM_RUN ioctl (cf.) communicates with userspace via a shared -memory region. This ioctl returns the size of that region. See the -KVM_RUN documentation for details. - - -4.6 KVM_SET_MEMORY_REGION - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: struct kvm_memory_region (in) -Returns: 0 on success, -1 on error - -This ioctl is obsolete and has been removed. - - -4.7 KVM_CREATE_VCPU - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: vcpu id (apic id on x86) -Returns: vcpu fd on success, -1 on error - -This API adds a vcpu to a virtual machine. No more than max_vcpus may be added. -The vcpu id is an integer in the range [0, max_vcpu_id). - -The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of -the KVM_CHECK_EXTENSION ioctl() at run-time. -The maximum possible value for max_vcpus can be retrieved using the -KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. - -If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 -cpus max. -If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is -same as the value returned from KVM_CAP_NR_VCPUS. - -The maximum possible value for max_vcpu_id can be retrieved using the -KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time. - -If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id -is the same as the value returned from KVM_CAP_MAX_VCPUS. - -On powerpc using book3s_hv mode, the vcpus are mapped onto virtual -threads in one or more virtual CPU cores. (This is because the -hardware requires all the hardware threads in a CPU core to be in the -same partition.) The KVM_CAP_PPC_SMT capability indicates the number -of vcpus per virtual core (vcore). The vcore id is obtained by -dividing the vcpu id by the number of vcpus per vcore. The vcpus in a -given vcore will always be in the same physical core as each other -(though that might be a different physical core from time to time). -Userspace can control the threading (SMT) mode of the guest by its -allocation of vcpu ids. For example, if userspace wants -single-threaded guest vcpus, it should make all vcpu ids be a multiple -of the number of vcpus per vcore. - -For virtual cpus that have been created with S390 user controlled virtual -machines, the resulting vcpu fd can be memory mapped at page offset -KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual -cpu's hardware control block. - - -4.8 KVM_GET_DIRTY_LOG (vm ioctl) - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: struct kvm_dirty_log (in/out) -Returns: 0 on success, -1 on error - -/* for KVM_GET_DIRTY_LOG */ -struct kvm_dirty_log { - __u32 slot; - __u32 padding; - union { - void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; - }; -}; - -Given a memory slot, return a bitmap containing any pages dirtied -since the last call to this ioctl. Bit 0 is the first page in the -memory slot. Ensure the entire structure is cleared to avoid padding -issues. - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies -the address space for which you want to return the dirty bitmap. -They must be less than the value that KVM_CHECK_EXTENSION returns for -the KVM_CAP_MULTI_ADDRESS_SPACE capability. - -The bits in the dirty bitmap are cleared before the ioctl returns, unless -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, -see the description of the capability. - -4.9 KVM_SET_MEMORY_ALIAS - -Capability: basic -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_memory_alias (in) -Returns: 0 (success), -1 (error) - -This ioctl is obsolete and has been removed. - - -4.10 KVM_RUN - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error -Errors: - EINTR: an unmasked signal is pending - -This ioctl is used to run a guest virtual cpu. While there are no -explicit parameters, there is an implicit parameter block that can be -obtained by mmap()ing the vcpu fd at offset 0, with the size given by -KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct -kvm_run' (see below). - - -4.11 KVM_GET_REGS - -Capability: basic -Architectures: all except ARM, arm64 -Type: vcpu ioctl -Parameters: struct kvm_regs (out) -Returns: 0 on success, -1 on error - -Reads the general purpose registers from the vcpu. - -/* x86 */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 rax, rbx, rcx, rdx; - __u64 rsi, rdi, rsp, rbp; - __u64 r8, r9, r10, r11; - __u64 r12, r13, r14, r15; - __u64 rip, rflags; -}; - -/* mips */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 gpr[32]; - __u64 hi; - __u64 lo; - __u64 pc; -}; - - -4.12 KVM_SET_REGS - -Capability: basic -Architectures: all except ARM, arm64 -Type: vcpu ioctl -Parameters: struct kvm_regs (in) -Returns: 0 on success, -1 on error - -Writes the general purpose registers into the vcpu. - -See KVM_GET_REGS for the data structure. - - -4.13 KVM_GET_SREGS - -Capability: basic -Architectures: x86, ppc -Type: vcpu ioctl -Parameters: struct kvm_sregs (out) -Returns: 0 on success, -1 on error - -Reads special registers from the vcpu. - -/* x86 */ -struct kvm_sregs { - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; -}; - -/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ - -interrupt_bitmap is a bitmap of pending external interrupts. At most -one bit may be set. This interrupt has been acknowledged by the APIC -but not yet injected into the cpu core. - - -4.14 KVM_SET_SREGS - -Capability: basic -Architectures: x86, ppc -Type: vcpu ioctl -Parameters: struct kvm_sregs (in) -Returns: 0 on success, -1 on error - -Writes special registers into the vcpu. See KVM_GET_SREGS for the -data structures. - - -4.15 KVM_TRANSLATE - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_translation (in/out) -Returns: 0 on success, -1 on error - -Translates a virtual address according to the vcpu's current address -translation mode. - -struct kvm_translation { - /* in */ - __u64 linear_address; - - /* out */ - __u64 physical_address; - __u8 valid; - __u8 writeable; - __u8 usermode; - __u8 pad[5]; -}; - - -4.16 KVM_INTERRUPT - -Capability: basic -Architectures: x86, ppc, mips -Type: vcpu ioctl -Parameters: struct kvm_interrupt (in) -Returns: 0 on success, negative on failure. - -Queues a hardware interrupt vector to be injected. - -/* for KVM_INTERRUPT */ -struct kvm_interrupt { - /* in */ - __u32 irq; -}; - -X86: - -Returns: 0 on success, - -EEXIST if an interrupt is already enqueued - -EINVAL the the irq number is invalid - -ENXIO if the PIC is in the kernel - -EFAULT if the pointer is invalid - -Note 'irq' is an interrupt vector, not an interrupt pin or line. This -ioctl is useful if the in-kernel PIC is not used. - -PPC: - -Queues an external interrupt to be injected. This ioctl is overleaded -with 3 different irq values: - -a) KVM_INTERRUPT_SET - - This injects an edge type external interrupt into the guest once it's ready - to receive interrupts. When injected, the interrupt is done. - -b) KVM_INTERRUPT_UNSET - - This unsets any pending interrupt. - - Only available with KVM_CAP_PPC_UNSET_IRQ. - -c) KVM_INTERRUPT_SET_LEVEL - - This injects a level type external interrupt into the guest context. The - interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET - is triggered. - - Only available with KVM_CAP_PPC_IRQ_LEVEL. - -Note that any value for 'irq' other than the ones stated above is invalid -and incurs unexpected behavior. - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - -MIPS: - -Queues an external interrupt to be injected into the virtual CPU. A negative -interrupt number dequeues the interrupt. - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - - -4.17 KVM_DEBUG_GUEST - -Capability: basic -Architectures: none -Type: vcpu ioctl -Parameters: none) -Returns: -1 on error - -Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. - - -4.18 KVM_GET_MSRS - -Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system) -Architectures: x86 -Type: system ioctl, vcpu ioctl -Parameters: struct kvm_msrs (in/out) -Returns: number of msrs successfully returned; - -1 on error - -When used as a system ioctl: -Reads the values of MSR-based features that are available for the VM. This -is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values. -The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST -in a system ioctl. - -When used as a vcpu ioctl: -Reads model-specific registers from the vcpu. Supported msr indices can -be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl. - -struct kvm_msrs { - __u32 nmsrs; /* number of msrs in entries */ - __u32 pad; - - struct kvm_msr_entry entries[0]; -}; - -struct kvm_msr_entry { - __u32 index; - __u32 reserved; - __u64 data; -}; - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array) and the 'index' member of each array entry. -kvm will fill in the 'data' member. - - -4.19 KVM_SET_MSRS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_msrs (in) -Returns: 0 on success, -1 on error - -Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the -data structures. - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array), and the 'index' and 'data' members of each -array entry. - - -4.20 KVM_SET_CPUID - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_cpuid (in) -Returns: 0 on success, -1 on error - -Defines the vcpu responses to the cpuid instruction. Applications -should use the KVM_SET_CPUID2 ioctl if available. - - -struct kvm_cpuid_entry { - __u32 function; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding; -}; - -/* for KVM_SET_CPUID */ -struct kvm_cpuid { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry entries[0]; -}; - - -4.21 KVM_SET_SIGNAL_MASK - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_signal_mask (in) -Returns: 0 on success, -1 on error - -Defines which signals are blocked during execution of KVM_RUN. This -signal mask temporarily overrides the threads signal mask. Any -unblocked signal received (except SIGKILL and SIGSTOP, which retain -their traditional behaviour) will cause KVM_RUN to return with -EINTR. - -Note the signal will only be delivered if not blocked by the original -signal mask. - -/* for KVM_SET_SIGNAL_MASK */ -struct kvm_signal_mask { - __u32 len; - __u8 sigset[0]; -}; - - -4.22 KVM_GET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (out) -Returns: 0 on success, -1 on error - -Reads the floating point state from the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - - -4.23 KVM_SET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (in) -Returns: 0 on success, -1 on error - -Writes the floating point state to the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - - -4.24 KVM_CREATE_IRQCHIP - -Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) -Architectures: x86, ARM, arm64, s390 -Type: vm ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Creates an interrupt controller model in the kernel. -On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up -future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both -PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. -On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of -KVM_CREATE_DEVICE, which also supports creating a GICv2. Using -KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2. -On s390, a dummy irq routing table is created. - -Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled -before KVM_CREATE_IRQCHIP can be used. - - -4.25 KVM_IRQ_LINE - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, arm, arm64 -Type: vm ioctl -Parameters: struct kvm_irq_level -Returns: 0 on success, -1 on error - -Sets the level of a GSI input to the interrupt controller model in the kernel. -On some architectures it is required that an interrupt controller model has -been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered -interrupts require the level to be set to 1 and then back to 0. - -On real hardware, interrupt pins can be active-low or active-high. This -does not matter for the level field of struct kvm_irq_level: 1 always -means active (asserted), 0 means inactive (deasserted). - -x86 allows the operating system to program the interrupt polarity -(active-low/active-high) for level-triggered interrupts, and KVM used -to consider the polarity. However, due to bitrot in the handling of -active-low interrupts, the above convention is now valid on x86 too. -This is signaled by KVM_CAP_X86_IOAPIC_POLARITY_IGNORED. Userspace -should not present interrupts to the guest as active-low unless this -capability is present (or unless it is not using the in-kernel irqchip, -of course). - - -ARM/arm64 can signal an interrupt either at the CPU level, or at the -in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to -use PPIs designated for specific cpus. The irq field is interpreted -like this: - -  bits: | 31 ... 24 | 23 ... 16 | 15 ... 0 | - field: | irq_type | vcpu_index | irq_id | - -The irq_type field has the following values: -- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ -- irq_type[1]: in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.) - (the vcpu_index field is ignored) -- irq_type[2]: in-kernel GIC: PPI, irq_id between 16 and 31 (incl.) - -(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) - -In both cases, level is used to assert/deassert the line. - -struct kvm_irq_level { - union { - __u32 irq; /* GSI */ - __s32 status; /* not used for KVM_IRQ_LEVEL */ - }; - __u32 level; /* 0 or 1 */ -}; - - -4.26 KVM_GET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_irqchip (in/out) -Returns: 0 on success, -1 on error - -Reads the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP into a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - - -4.27 KVM_SET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_irqchip (in) -Returns: 0 on success, -1 on error - -Sets the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP from a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - - -4.28 KVM_XEN_HVM_CONFIG - -Capability: KVM_CAP_XEN_HVM -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_xen_hvm_config (in) -Returns: 0 on success, -1 on error - -Sets the MSR that the Xen HVM guest uses to initialize its hypercall -page, and provides the starting address and size of the hypercall -blobs in userspace. When the guest writes the MSR, kvm copies one -page of a blob (32- or 64-bit, depending on the vcpu mode) to guest -memory. - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; - - -4.29 KVM_GET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (out) -Returns: 0 on success, -1 on error - -Gets the current timestamp of kvmclock as seen by the current guest. In -conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the -set of bits that KVM can return in struct kvm_clock_data's flag member. - -The only flag defined now is KVM_CLOCK_TSC_STABLE. If set, the returned -value is the exact kvmclock value seen by all VCPUs at the instant -when KVM_GET_CLOCK was called. If clear, the returned value is simply -CLOCK_MONOTONIC plus a constant offset; the offset can be modified -with KVM_SET_CLOCK. KVM will try to make all VCPUs follow this clock, -but the exact value read by each VCPU could differ, because the host -TSC is not stable. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - - -4.30 KVM_SET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (in) -Returns: 0 on success, -1 on error - -Sets the current timestamp of kvmclock to the value specified in its parameter. -In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - - -4.31 KVM_GET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_event (out) -Returns: 0 on success, -1 on error - -X86: - -Gets currently pending exceptions, interrupts, and NMIs as well as related -states of the vcpu. - -struct kvm_vcpu_events { - struct { - __u8 injected; - __u8 nr; - __u8 has_error_code; - __u8 pending; - __u32 error_code; - } exception; - struct { - __u8 injected; - __u8 nr; - __u8 soft; - __u8 shadow; - } interrupt; - struct { - __u8 injected; - __u8 pending; - __u8 masked; - __u8 pad; - } nmi; - __u32 sipi_vector; - __u32 flags; - struct { - __u8 smm; - __u8 pending; - __u8 smm_inside_nmi; - __u8 latched_init; - } smi; - __u8 reserved[27]; - __u8 exception_has_payload; - __u64 exception_payload; -}; - -The following bits are defined in the flags field: - -- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that - interrupt.shadow contains a valid state. - -- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a - valid state. - -- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the - exception_has_payload, exception_payload, and exception.pending - fields contain a valid state. This bit will be set whenever - KVM_CAP_EXCEPTION_PAYLOAD is enabled. - -ARM/ARM64: - -If the guest accesses a device that is being emulated by the host kernel in -such a way that a real device would generate a physical SError, KVM may make -a virtual SError pending for that VCPU. This system error interrupt remains -pending until the guest takes the exception by unmasking PSTATE.A. - -Running the VCPU may cause it to take a pending SError, or make an access that -causes an SError to become pending. The event's description is only valid while -the VPCU is not running. - -This API provides a way to read and write the pending 'event' state that is not -visible to the guest. To save, restore or migrate a VCPU the struct representing -the state can be read then written using this GET/SET API, along with the other -guest-visible registers. It is not possible to 'cancel' an SError that has been -made pending. - -A device being emulated in user-space may also wish to generate an SError. To do -this the events structure can be populated by user-space. The current state -should be read first, to ensure no existing SError is pending. If an existing -SError is pending, the architecture's 'Multiple SError interrupts' rules should -be followed. (2.5.3 of DDI0587.a "ARM Reliability, Availability, and -Serviceability (RAS) Specification"). - -SError exceptions always have an ESR value. Some CPUs have the ability to -specify what the virtual SError's ESR value should be. These systems will -advertise KVM_CAP_ARM_INJECT_SERROR_ESR. In this case exception.has_esr will -always have a non-zero value when read, and the agent making an SError pending -should specify the ISS field in the lower 24 bits of exception.serror_esr. If -the system supports KVM_CAP_ARM_INJECT_SERROR_ESR, but user-space sets the events -with exception.has_esr as zero, KVM will choose an ESR. - -Specifying exception.has_esr on a system that does not support it will return --EINVAL. Setting anything other than the lower 24bits of exception.serror_esr -will return -EINVAL. - -struct kvm_vcpu_events { - struct { - __u8 serror_pending; - __u8 serror_has_esr; - /* Align it to 8 bytes */ - __u8 pad[6]; - __u64 serror_esr; - } exception; - __u32 reserved[12]; -}; - -4.32 KVM_SET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_event (in) -Returns: 0 on success, -1 on error - -X86: - -Set pending exceptions, interrupts, and NMIs as well as related states of the -vcpu. - -See KVM_GET_VCPU_EVENTS for the data structure. - -Fields that may be modified asynchronously by running VCPUs can be excluded -from the update. These fields are nmi.pending, sipi_vector, smi.smm, -smi.pending. Keep the corresponding bits in the flags field cleared to -suppress overwriting the current in-kernel state. The bits are: - -KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel -KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector -KVM_VCPUEVENT_VALID_SMM - transfer the smi sub-struct. - -If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in -the flags field to signal that interrupt.shadow contains a valid state and -shall be written into the VCPU. - -KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. - -If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD -can be set in the flags field to signal that the -exception_has_payload, exception_payload, and exception.pending fields -contain a valid state and shall be written into the VCPU. - -ARM/ARM64: - -Set the pending SError exception state for this VCPU. It is not possible to -'cancel' an Serror that has been made pending. - -See KVM_GET_VCPU_EVENTS for the data structure. - - -4.33 KVM_GET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (out) -Returns: 0 on success, -1 on error - -Reads debug registers from the vcpu. - -struct kvm_debugregs { - __u64 db[4]; - __u64 dr6; - __u64 dr7; - __u64 flags; - __u64 reserved[9]; -}; - - -4.34 KVM_SET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (in) -Returns: 0 on success, -1 on error - -Writes debug registers into the vcpu. - -See KVM_GET_DEBUGREGS for the data structure. The flags field is unused -yet and must be cleared on entry. - - -4.35 KVM_SET_USER_MEMORY_REGION - -Capability: KVM_CAP_USER_MEMORY -Architectures: all -Type: vm ioctl -Parameters: struct kvm_userspace_memory_region (in) -Returns: 0 on success, -1 on error - -struct kvm_userspace_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ - __u64 userspace_addr; /* start of the userspace allocated memory */ -}; - -/* for kvm_memory_region::flags */ -#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) -#define KVM_MEM_READONLY (1UL << 1) - -This ioctl allows the user to create, modify or delete a guest physical -memory slot. Bits 0-15 of "slot" specify the slot id and this value -should be less than the maximum number of user memory slots supported per -VM. The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS. -Slots may not overlap in guest physical address space. - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" -specifies the address space which is being modified. They must be -less than the value that KVM_CHECK_EXTENSION returns for the -KVM_CAP_MULTI_ADDRESS_SPACE capability. Slots in separate address spaces -are unrelated; the restriction on overlapping slots only applies within -each address space. - -Deleting a slot is done by passing zero for memory_size. When changing -an existing slot, it may be moved in the guest physical memory space, -or its flags may be modified, but it may not be resized. - -Memory for the region is taken starting at the address denoted by the -field userspace_addr, which must point at user addressable memory for -the entire memory slot size. Any object may back this memory, including -anonymous memory, ordinary files, and hugetlbfs. - -It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr -be identical. This allows large pages in the guest to be backed by large -pages in the host. - -The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and -KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of -writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to -use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, -to make a new slot read-only. In this case, writes to this memory will be -posted to userspace as KVM_EXIT_MMIO exits. - -When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of -the memory region are automatically reflected into the guest. For example, an -mmap() that affects the region will be made visible immediately. Another -example is madvise(MADV_DROP). - -It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. -The KVM_SET_MEMORY_REGION does not allow fine grained control over memory -allocation and is deprecated. - - -4.36 KVM_SET_TSS_ADDR - -Capability: KVM_CAP_SET_TSS_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long tss_address (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a three-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - - -4.37 KVM_ENABLE_CAP - -Capability: KVM_CAP_ENABLE_CAP -Architectures: mips, ppc, s390 -Type: vcpu ioctl -Parameters: struct kvm_enable_cap (in) -Returns: 0 on success; -1 on error - -Capability: KVM_CAP_ENABLE_CAP_VM -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_enable_cap (in) -Returns: 0 on success; -1 on error - -+Not all extensions are enabled by default. Using this ioctl the application -can enable an extension, making it available to the guest. - -On systems that do not support this ioctl, it always fails. On systems that -do support it, it only works for extensions that are supported for enablement. - -To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should -be used. - -struct kvm_enable_cap { - /* in */ - __u32 cap; - -The capability that is supposed to get enabled. - - __u32 flags; - -A bitfield indicating future enhancements. Has to be 0 for now. - - __u64 args[4]; - -Arguments for enabling a feature. If a feature needs initial values to -function properly, this is the place to put them. - - __u8 pad[64]; -}; - -The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl -for vm-wide capabilities. - -4.38 KVM_GET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, s390, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (out) -Returns: 0 on success; -1 on error - -struct kvm_mp_state { - __u32 mp_state; -}; - -Returns the vcpu's current "multiprocessing state" (though also valid on -uniprocessor guests). - -Possible values are: - - - KVM_MP_STATE_RUNNABLE: the vcpu is currently running [x86,arm/arm64] - - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) - which has not yet received an INIT signal [x86] - - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is - now ready for a SIPI [x86] - - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and - is waiting for an interrupt [x86] - - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector - accessible via KVM_GET_VCPU_EVENTS) [x86] - - KVM_MP_STATE_STOPPED: the vcpu is stopped [s390,arm/arm64] - - KVM_MP_STATE_CHECK_STOP: the vcpu is in a special error state [s390] - - KVM_MP_STATE_OPERATING: the vcpu is operating (running or halted) - [s390] - - KVM_MP_STATE_LOAD: the vcpu is in a special load/startup state - [s390] - -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an -in-kernel irqchip, the multiprocessing state must be maintained by userspace on -these architectures. - -For arm/arm64: - -The only states that are valid are KVM_MP_STATE_STOPPED and -KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. - -4.39 KVM_SET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, s390, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (in) -Returns: 0 on success; -1 on error - -Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for -arguments. - -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an -in-kernel irqchip, the multiprocessing state must be maintained by userspace on -these architectures. - -For arm/arm64: - -The only states that are valid are KVM_MP_STATE_STOPPED and -KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not. - -4.40 KVM_SET_IDENTITY_MAP_ADDR - -Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long identity (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a one-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -Setting the address to 0 will result in resetting the address to its default -(0xfffbc000). - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - -Fails if any VCPU has already been created. - -4.41 KVM_SET_BOOT_CPU_ID - -Capability: KVM_CAP_SET_BOOT_CPU_ID -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long vcpu_id -Returns: 0 on success, -1 on error - -Define which vcpu is the Bootstrap Processor (BSP). Values are the same -as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default -is vcpu 0. - - -4.42 KVM_GET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (out) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy current vcpu's xsave struct to the userspace. - - -4.43 KVM_SET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (in) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy userspace's xsave struct to the kernel. - - -4.44 KVM_GET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (out) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would copy current vcpu's xcrs to the userspace. - - -4.45 KVM_SET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (in) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would set vcpu's xcr to the value userspace specified. - - -4.46 KVM_GET_SUPPORTED_CPUID - -Capability: KVM_CAP_EXT_CPUID -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features which are supported by both the -hardware and kvm in its default configuration. Userspace can use the -information returned by this ioctl to construct cpuid information (for -KVM_SET_CPUID2) that is consistent with hardware, kernel, and -userspace capabilities, and with user requirements (for example, the -user may wish to constrain cpuid to emulate older hardware, or for -feature consistency across a cluster). - -Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may -expose cpuid features (e.g. MONITOR) which are not supported by kvm in -its default configuration. If userspace enables such capabilities, it -is responsible for modifying the results of this ioctl appropriately. - -Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure -with the 'nent' field indicating the number of entries in the variable-size -array 'entries'. If the number of entries is too low to describe the cpu -capabilities, an error (E2BIG) is returned. If the number is too high, -the 'nent' field is adjusted and an error (ENOMEM) is returned. If the -number is just right, the 'nent' field is adjusted to the number of valid -entries in the 'entries' array, which is then filled. - -The entries returned are the host cpuid as returned by the cpuid instruction, -with unknown or unsupported features masked out. Some features (for example, -x2apic), may not be present in the host cpu, but are exposed by kvm if it can -emulate them efficiently. The fields in each entry are defined as follows: - - function: the eax value used to obtain the entry - index: the ecx value used to obtain the entry (for entries that are - affected by ecx) - flags: an OR of zero or more of the following: - KVM_CPUID_FLAG_SIGNIFCANT_INDEX: - if the index field is valid - KVM_CPUID_FLAG_STATEFUL_FUNC: - if cpuid for this function returns different values for successive - invocations; there will be several entries with the same function, - all with this flag set - KVM_CPUID_FLAG_STATE_READ_NEXT: - for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is - the first entry to be read by a cpu - eax, ebx, ecx, edx: the values returned by the cpuid instruction for - this function/index combination - -The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned -as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC -support. Instead it is reported via - - ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) - -if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the -feature in userspace, then you can enable the feature for KVM_SET_CPUID2. - - -4.47 KVM_PPC_GET_PVINFO - -Capability: KVM_CAP_PPC_GET_PVINFO -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_pvinfo (out) -Returns: 0 on success, !0 on error - -struct kvm_ppc_pvinfo { - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -This ioctl fetches PV specific information that need to be passed to the guest -using the device tree or other means from vm context. - -The hcall array defines 4 instructions that make up a hypercall. - -If any additional field gets added to this structure later on, a bit for that -additional piece of information will be set in the flags bitmap. - -The flags bitmap is defined as: - - /* the host supports the ePAPR idle hcall - #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) - -4.52 KVM_SET_GSI_ROUTING - -Capability: KVM_CAP_IRQ_ROUTING -Architectures: x86 s390 arm arm64 -Type: vm ioctl -Parameters: struct kvm_irq_routing (in) -Returns: 0 on success, -1 on error - -Sets the GSI routing table entries, overwriting any previously set entries. - -On arm/arm64, GSI routing has the following limitation: -- GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD. - -struct kvm_irq_routing { - __u32 nr; - __u32 flags; - struct kvm_irq_routing_entry entries[0]; -}; - -No flags are specified so far, the corresponding field must be set to zero. - -struct kvm_irq_routing_entry { - __u32 gsi; - __u32 type; - __u32 flags; - __u32 pad; - union { - struct kvm_irq_routing_irqchip irqchip; - struct kvm_irq_routing_msi msi; - struct kvm_irq_routing_s390_adapter adapter; - struct kvm_irq_routing_hv_sint hv_sint; - __u32 pad[8]; - } u; -}; - -/* gsi routing entry types */ -#define KVM_IRQ_ROUTING_IRQCHIP 1 -#define KVM_IRQ_ROUTING_MSI 2 -#define KVM_IRQ_ROUTING_S390_ADAPTER 3 -#define KVM_IRQ_ROUTING_HV_SINT 4 - -flags: -- KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI routing entry - type, specifies that the devid field contains a valid value. The per-VM - KVM_CAP_MSI_DEVID capability advertises the requirement to provide - the device ID. If this capability is not available, userspace should - never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. -- zero otherwise - -struct kvm_irq_routing_irqchip { - __u32 irqchip; - __u32 pin; -}; - -struct kvm_irq_routing_msi { - __u32 address_lo; - __u32 address_hi; - __u32 data; - union { - __u32 pad; - __u32 devid; - }; -}; - -If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier -for the device that wrote the MSI message. For PCI, this is usually a -BFD identifier in the lower 16 bits. - -On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS -feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, -address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of -address_hi must be zero. - -struct kvm_irq_routing_s390_adapter { - __u64 ind_addr; - __u64 summary_addr; - __u64 ind_offset; - __u32 summary_offset; - __u32 adapter_id; -}; - -struct kvm_irq_routing_hv_sint { - __u32 vcpu; - __u32 sint; -}; - - -4.55 KVM_SET_TSC_KHZ - -Capability: KVM_CAP_TSC_CONTROL -Architectures: x86 -Type: vcpu ioctl -Parameters: virtual tsc_khz -Returns: 0 on success, -1 on error - -Specifies the tsc frequency for the virtual machine. The unit of the -frequency is KHz. - - -4.56 KVM_GET_TSC_KHZ - -Capability: KVM_CAP_GET_TSC_KHZ -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: virtual tsc-khz on success, negative value on error - -Returns the tsc frequency of the guest. The unit of the return value is -KHz. If the host has unstable tsc this ioctl returns -EIO instead as an -error. - - -4.57 KVM_GET_LAPIC - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_lapic_state (out) -Returns: 0 on success, -1 on error - -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -Reads the Local APIC registers and copies them into the input argument. The -data format and layout are the same as documented in the architecture manual. - -If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is -enabled, then the format of APIC_ID register depends on the APIC mode -(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in -the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID -which is stored in bits 31-24 of the APIC register, or equivalently in -byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then -be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. - -If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state -always uses xAPIC format. - - -4.58 KVM_SET_LAPIC - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_lapic_state (in) -Returns: 0 on success, -1 on error - -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -Copies the input argument into the Local APIC registers. The data format -and layout are the same as documented in the architecture manual. - -The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's -regs field) depends on the state of the KVM_CAP_X2APIC_API capability. -See the note in KVM_GET_LAPIC. - - -4.59 KVM_IOEVENTFD - -Capability: KVM_CAP_IOEVENTFD -Architectures: all -Type: vm ioctl -Parameters: struct kvm_ioeventfd (in) -Returns: 0 on success, !0 on error - -This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address -within the guest. A guest write in the registered address will signal the -provided event instead of triggering an exit. - -struct kvm_ioeventfd { - __u64 datamatch; - __u64 addr; /* legal pio/mmio address */ - __u32 len; /* 0, 1, 2, 4, or 8 bytes */ - __s32 fd; - __u32 flags; - __u8 pad[36]; -}; - -For the special case of virtio-ccw devices on s390, the ioevent is matched -to a subchannel/virtqueue tuple instead. - -The following flags are defined: - -#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) -#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) -#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) -#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ - (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) - -If datamatch flag is set, the event will be signaled only if the written value -to the registered address is equal to datamatch in struct kvm_ioeventfd. - -For virtio-ccw devices, addr contains the subchannel id and datamatch the -virtqueue index. - -With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and -the kernel will ignore the length of guest write and may get a faster vmexit. -The speedup may only apply to specific architectures, but the ioeventfd will -work anyway. - -4.60 KVM_DIRTY_TLB - -Capability: KVM_CAP_SW_TLB -Architectures: ppc -Type: vcpu ioctl -Parameters: struct kvm_dirty_tlb (in) -Returns: 0 on success, -1 on error - -struct kvm_dirty_tlb { - __u64 bitmap; - __u32 num_dirty; -}; - -This must be called whenever userspace has changed an entry in the shared -TLB, prior to calling KVM_RUN on the associated vcpu. - -The "bitmap" field is the userspace address of an array. This array -consists of a number of bits, equal to the total number of TLB entries as -determined by the last successful call to KVM_CONFIG_TLB, rounded up to the -nearest multiple of 64. - -Each bit corresponds to one TLB entry, ordered the same as in the shared TLB -array. - -The array is little-endian: the bit 0 is the least significant bit of the -first byte, bit 8 is the least significant bit of the second byte, etc. -This avoids any complications with differing word sizes. - -The "num_dirty" field is a performance hint for KVM to determine whether it -should skip processing the bitmap and just invalidate everything. It must -be set to the number of set bits in the bitmap. - - -4.62 KVM_CREATE_SPAPR_TCE - -Capability: KVM_CAP_SPAPR_TCE -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_create_spapr_tce (in) -Returns: file descriptor for manipulating the created TCE table - -This creates a virtual TCE (translation control entry) table, which -is an IOMMU for PAPR-style virtual I/O. It is used to translate -logical addresses used in virtual I/O into guest physical addresses, -and provides a scatter/gather capability for PAPR virtual I/O. - -/* for KVM_CAP_SPAPR_TCE */ -struct kvm_create_spapr_tce { - __u64 liobn; - __u32 window_size; -}; - -The liobn field gives the logical IO bus number for which to create a -TCE table. The window_size field specifies the size of the DMA window -which this TCE table will translate - the table will contain one 64 -bit TCE entry for every 4kiB of the DMA window. - -When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE -table has been created using this ioctl(), the kernel will handle it -in real mode, updating the TCE table. H_PUT_TCE calls for other -liobns will cause a vm exit and must be handled by userspace. - -The return value is a file descriptor which can be passed to mmap(2) -to map the created TCE table into userspace. This lets userspace read -the entries written by kernel-handled H_PUT_TCE calls, and also lets -userspace update the TCE table directly which is useful in some -circumstances. - - -4.63 KVM_ALLOCATE_RMA - -Capability: KVM_CAP_PPC_RMA -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_allocate_rma (out) -Returns: file descriptor for mapping the allocated RMA - -This allocates a Real Mode Area (RMA) from the pool allocated at boot -time by the kernel. An RMA is a physically-contiguous, aligned region -of memory used on older POWER processors to provide the memory which -will be accessed by real-mode (MMU off) accesses in a KVM guest. -POWER processors support a set of sizes for the RMA that usually -includes 64MB, 128MB, 256MB and some larger powers of two. - -/* for KVM_ALLOCATE_RMA */ -struct kvm_allocate_rma { - __u64 rma_size; -}; - -The return value is a file descriptor which can be passed to mmap(2) -to map the allocated RMA into userspace. The mapped area can then be -passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the -RMA for a virtual machine. The size of the RMA in bytes (which is -fixed at host kernel boot time) is returned in the rma_size field of -the argument structure. - -The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl -is supported; 2 if the processor requires all virtual machines to have -an RMA, or 1 if the processor can use an RMA but doesn't require it, -because it supports the Virtual RMA (VRMA) facility. - - -4.64 KVM_NMI - -Capability: KVM_CAP_USER_NMI -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Queues an NMI on the thread's vcpu. Note this is well defined only -when KVM_CREATE_IRQCHIP has not been called, since this is an interface -between the virtual cpu core and virtual local APIC. After KVM_CREATE_IRQCHIP -has been called, this interface is completely emulated within the kernel. - -To use this to emulate the LINT1 input with KVM_CREATE_IRQCHIP, use the -following algorithm: - - - pause the vcpu - - read the local APIC's state (KVM_GET_LAPIC) - - check whether changing LINT1 will queue an NMI (see the LVT entry for LINT1) - - if so, issue KVM_NMI - - resume the vcpu - -Some guests configure the LINT1 NMI input to cause a panic, aiding in -debugging. - - -4.65 KVM_S390_UCAS_MAP - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_ucas_mapping (in) -Returns: 0 in case of success - -The parameter is defined like this: - struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; - }; - -This ioctl maps the memory at "user_addr" with the length "length" to -the vcpu's address space starting at "vcpu_addr". All parameters need to -be aligned by 1 megabyte. - - -4.66 KVM_S390_UCAS_UNMAP - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_ucas_mapping (in) -Returns: 0 in case of success - -The parameter is defined like this: - struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; - }; - -This ioctl unmaps the memory in the vcpu's address space starting at -"vcpu_addr" with the length "length". The field "user_addr" is ignored. -All parameters need to be aligned by 1 megabyte. - - -4.67 KVM_S390_VCPU_FAULT - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: vcpu absolute address (in) -Returns: 0 in case of success - -This call creates a page table entry on the virtual cpu's address space -(for user controlled virtual machines) or the virtual machine's address -space (for regular virtual machines). This only works for minor faults, -thus it's recommended to access subject memory page via the user page -table upfront. This is useful to handle validity intercepts for user -controlled virtual machines to fault in the virtual cpu's lowcore pages -prior to calling the KVM_RUN ioctl. - - -4.68 KVM_SET_ONE_REG - -Capability: KVM_CAP_ONE_REG -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_one_reg (in) -Returns: 0 on success, negative value on failure -Errors: -  ENOENT:   no such register -  EINVAL:   invalid register ID, or no such register -  EPERM:    (arm64) register access not allowed before vcpu finalization -(These error codes are indicative only: do not rely on a specific error -code being returned in a specific situation.) - -struct kvm_one_reg { - __u64 id; - __u64 addr; -}; - -Using this ioctl, a single vcpu register can be set to a specific value -defined by user space with the passed in struct kvm_one_reg, where id -refers to the register identifier as described below and addr is a pointer -to a variable with the respective size. There can be architecture agnostic -and architecture specific registers. Each have their own range of operation -and their own constants and width. To keep track of the implemented -registers, find a list below: - - Arch | Register | Width (bits) - | | - PPC | KVM_REG_PPC_HIOR | 64 - PPC | KVM_REG_PPC_IAC1 | 64 - PPC | KVM_REG_PPC_IAC2 | 64 - PPC | KVM_REG_PPC_IAC3 | 64 - PPC | KVM_REG_PPC_IAC4 | 64 - PPC | KVM_REG_PPC_DAC1 | 64 - PPC | KVM_REG_PPC_DAC2 | 64 - PPC | KVM_REG_PPC_DABR | 64 - PPC | KVM_REG_PPC_DSCR | 64 - PPC | KVM_REG_PPC_PURR | 64 - PPC | KVM_REG_PPC_SPURR | 64 - PPC | KVM_REG_PPC_DAR | 64 - PPC | KVM_REG_PPC_DSISR | 32 - PPC | KVM_REG_PPC_AMR | 64 - PPC | KVM_REG_PPC_UAMOR | 64 - PPC | KVM_REG_PPC_MMCR0 | 64 - PPC | KVM_REG_PPC_MMCR1 | 64 - PPC | KVM_REG_PPC_MMCRA | 64 - PPC | KVM_REG_PPC_MMCR2 | 64 - PPC | KVM_REG_PPC_MMCRS | 64 - PPC | KVM_REG_PPC_SIAR | 64 - PPC | KVM_REG_PPC_SDAR | 64 - PPC | KVM_REG_PPC_SIER | 64 - PPC | KVM_REG_PPC_PMC1 | 32 - PPC | KVM_REG_PPC_PMC2 | 32 - PPC | KVM_REG_PPC_PMC3 | 32 - PPC | KVM_REG_PPC_PMC4 | 32 - PPC | KVM_REG_PPC_PMC5 | 32 - PPC | KVM_REG_PPC_PMC6 | 32 - PPC | KVM_REG_PPC_PMC7 | 32 - PPC | KVM_REG_PPC_PMC8 | 32 - PPC | KVM_REG_PPC_FPR0 | 64 - ... - PPC | KVM_REG_PPC_FPR31 | 64 - PPC | KVM_REG_PPC_VR0 | 128 - ... - PPC | KVM_REG_PPC_VR31 | 128 - PPC | KVM_REG_PPC_VSR0 | 128 - ... - PPC | KVM_REG_PPC_VSR31 | 128 - PPC | KVM_REG_PPC_FPSCR | 64 - PPC | KVM_REG_PPC_VSCR | 32 - PPC | KVM_REG_PPC_VPA_ADDR | 64 - PPC | KVM_REG_PPC_VPA_SLB | 128 - PPC | KVM_REG_PPC_VPA_DTL | 128 - PPC | KVM_REG_PPC_EPCR | 32 - PPC | KVM_REG_PPC_EPR | 32 - PPC | KVM_REG_PPC_TCR | 32 - PPC | KVM_REG_PPC_TSR | 32 - PPC | KVM_REG_PPC_OR_TSR | 32 - PPC | KVM_REG_PPC_CLEAR_TSR | 32 - PPC | KVM_REG_PPC_MAS0 | 32 - PPC | KVM_REG_PPC_MAS1 | 32 - PPC | KVM_REG_PPC_MAS2 | 64 - PPC | KVM_REG_PPC_MAS7_3 | 64 - PPC | KVM_REG_PPC_MAS4 | 32 - PPC | KVM_REG_PPC_MAS6 | 32 - PPC | KVM_REG_PPC_MMUCFG | 32 - PPC | KVM_REG_PPC_TLB0CFG | 32 - PPC | KVM_REG_PPC_TLB1CFG | 32 - PPC | KVM_REG_PPC_TLB2CFG | 32 - PPC | KVM_REG_PPC_TLB3CFG | 32 - PPC | KVM_REG_PPC_TLB0PS | 32 - PPC | KVM_REG_PPC_TLB1PS | 32 - PPC | KVM_REG_PPC_TLB2PS | 32 - PPC | KVM_REG_PPC_TLB3PS | 32 - PPC | KVM_REG_PPC_EPTCFG | 32 - PPC | KVM_REG_PPC_ICP_STATE | 64 - PPC | KVM_REG_PPC_VP_STATE | 128 - PPC | KVM_REG_PPC_TB_OFFSET | 64 - PPC | KVM_REG_PPC_SPMC1 | 32 - PPC | KVM_REG_PPC_SPMC2 | 32 - PPC | KVM_REG_PPC_IAMR | 64 - PPC | KVM_REG_PPC_TFHAR | 64 - PPC | KVM_REG_PPC_TFIAR | 64 - PPC | KVM_REG_PPC_TEXASR | 64 - PPC | KVM_REG_PPC_FSCR | 64 - PPC | KVM_REG_PPC_PSPB | 32 - PPC | KVM_REG_PPC_EBBHR | 64 - PPC | KVM_REG_PPC_EBBRR | 64 - PPC | KVM_REG_PPC_BESCR | 64 - PPC | KVM_REG_PPC_TAR | 64 - PPC | KVM_REG_PPC_DPDES | 64 - PPC | KVM_REG_PPC_DAWR | 64 - PPC | KVM_REG_PPC_DAWRX | 64 - PPC | KVM_REG_PPC_CIABR | 64 - PPC | KVM_REG_PPC_IC | 64 - PPC | KVM_REG_PPC_VTB | 64 - PPC | KVM_REG_PPC_CSIGR | 64 - PPC | KVM_REG_PPC_TACR | 64 - PPC | KVM_REG_PPC_TCSCR | 64 - PPC | KVM_REG_PPC_PID | 64 - PPC | KVM_REG_PPC_ACOP | 64 - PPC | KVM_REG_PPC_VRSAVE | 32 - PPC | KVM_REG_PPC_LPCR | 32 - PPC | KVM_REG_PPC_LPCR_64 | 64 - PPC | KVM_REG_PPC_PPR | 64 - PPC | KVM_REG_PPC_ARCH_COMPAT | 32 - PPC | KVM_REG_PPC_DABRX | 32 - PPC | KVM_REG_PPC_WORT | 64 - PPC | KVM_REG_PPC_SPRG9 | 64 - PPC | KVM_REG_PPC_DBSR | 32 - PPC | KVM_REG_PPC_TIDR | 64 - PPC | KVM_REG_PPC_PSSCR | 64 - PPC | KVM_REG_PPC_DEC_EXPIRY | 64 - PPC | KVM_REG_PPC_PTCR | 64 - PPC | KVM_REG_PPC_TM_GPR0 | 64 - ... - PPC | KVM_REG_PPC_TM_GPR31 | 64 - PPC | KVM_REG_PPC_TM_VSR0 | 128 - ... - PPC | KVM_REG_PPC_TM_VSR63 | 128 - PPC | KVM_REG_PPC_TM_CR | 64 - PPC | KVM_REG_PPC_TM_LR | 64 - PPC | KVM_REG_PPC_TM_CTR | 64 - PPC | KVM_REG_PPC_TM_FPSCR | 64 - PPC | KVM_REG_PPC_TM_AMR | 64 - PPC | KVM_REG_PPC_TM_PPR | 64 - PPC | KVM_REG_PPC_TM_VRSAVE | 64 - PPC | KVM_REG_PPC_TM_VSCR | 32 - PPC | KVM_REG_PPC_TM_DSCR | 64 - PPC | KVM_REG_PPC_TM_TAR | 64 - PPC | KVM_REG_PPC_TM_XER | 64 - | | - MIPS | KVM_REG_MIPS_R0 | 64 - ... - MIPS | KVM_REG_MIPS_R31 | 64 - MIPS | KVM_REG_MIPS_HI | 64 - MIPS | KVM_REG_MIPS_LO | 64 - MIPS | KVM_REG_MIPS_PC | 64 - MIPS | KVM_REG_MIPS_CP0_INDEX | 32 - MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 - MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 - MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 - MIPS | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32 - MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 - MIPS | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64 - MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 - MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 - MIPS | KVM_REG_MIPS_CP0_SEGCTL0 | 64 - MIPS | KVM_REG_MIPS_CP0_SEGCTL1 | 64 - MIPS | KVM_REG_MIPS_CP0_SEGCTL2 | 64 - MIPS | KVM_REG_MIPS_CP0_PWBASE | 64 - MIPS | KVM_REG_MIPS_CP0_PWFIELD | 64 - MIPS | KVM_REG_MIPS_CP0_PWSIZE | 64 - MIPS | KVM_REG_MIPS_CP0_WIRED | 32 - MIPS | KVM_REG_MIPS_CP0_PWCTL | 32 - MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 - MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 - MIPS | KVM_REG_MIPS_CP0_BADINSTR | 32 - MIPS | KVM_REG_MIPS_CP0_BADINSTRP | 32 - MIPS | KVM_REG_MIPS_CP0_COUNT | 32 - MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 - MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 - MIPS | KVM_REG_MIPS_CP0_STATUS | 32 - MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 - MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 - MIPS | KVM_REG_MIPS_CP0_EPC | 64 - MIPS | KVM_REG_MIPS_CP0_PRID | 32 - MIPS | KVM_REG_MIPS_CP0_EBASE | 64 - MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG3 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG4 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 - MIPS | KVM_REG_MIPS_CP0_XCONTEXT | 64 - MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 - MIPS | KVM_REG_MIPS_CP0_MAAR(0..63) | 64 - MIPS | KVM_REG_MIPS_COUNT_CTL | 64 - MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 - MIPS | KVM_REG_MIPS_COUNT_HZ | 64 - MIPS | KVM_REG_MIPS_FPR_32(0..31) | 32 - MIPS | KVM_REG_MIPS_FPR_64(0..31) | 64 - MIPS | KVM_REG_MIPS_VEC_128(0..31) | 128 - MIPS | KVM_REG_MIPS_FCR_IR | 32 - MIPS | KVM_REG_MIPS_FCR_CSR | 32 - MIPS | KVM_REG_MIPS_MSA_IR | 32 - MIPS | KVM_REG_MIPS_MSA_CSR | 32 - -ARM registers are mapped using the lower 32 bits. The upper 16 of that -is the register group type, or coprocessor number: - -ARM core registers have the following id bit patterns: - 0x4020 0000 0010 - -ARM 32-bit CP15 registers have the following id bit patterns: - 0x4020 0000 000F - -ARM 64-bit CP15 registers have the following id bit patterns: - 0x4030 0000 000F - -ARM CCSIDR registers are demultiplexed by CSSELR value: - 0x4020 0000 0011 00 - -ARM 32-bit VFP control registers have the following id bit patterns: - 0x4020 0000 0012 1 - -ARM 64-bit FP registers have the following id bit patterns: - 0x4030 0000 0012 0 - -ARM firmware pseudo-registers have the following bit pattern: - 0x4030 0000 0014 - - -arm64 registers are mapped using the lower 32 bits. The upper 16 of -that is the register group type, or coprocessor number: - -arm64 core/FP-SIMD registers have the following id bit patterns. Note -that the size of the access is variable, as the kvm_regs structure -contains elements ranging from 32 to 128 bits. The index is a 32bit -value in the kvm_regs structure seen as a 32bit array. - 0x60x0 0000 0010 - -Specifically: - Encoding Register Bits kvm_regs member ----------------------------------------------------------------- - 0x6030 0000 0010 0000 X0 64 regs.regs[0] - 0x6030 0000 0010 0002 X1 64 regs.regs[1] - ... - 0x6030 0000 0010 003c X30 64 regs.regs[30] - 0x6030 0000 0010 003e SP 64 regs.sp - 0x6030 0000 0010 0040 PC 64 regs.pc - 0x6030 0000 0010 0042 PSTATE 64 regs.pstate - 0x6030 0000 0010 0044 SP_EL1 64 sp_el1 - 0x6030 0000 0010 0046 ELR_EL1 64 elr_el1 - 0x6030 0000 0010 0048 SPSR_EL1 64 spsr[KVM_SPSR_EL1] (alias SPSR_SVC) - 0x6030 0000 0010 004a SPSR_ABT 64 spsr[KVM_SPSR_ABT] - 0x6030 0000 0010 004c SPSR_UND 64 spsr[KVM_SPSR_UND] - 0x6030 0000 0010 004e SPSR_IRQ 64 spsr[KVM_SPSR_IRQ] - 0x6060 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] - 0x6040 0000 0010 0054 V0 128 fp_regs.vregs[0] (*) - 0x6040 0000 0010 0058 V1 128 fp_regs.vregs[1] (*) - ... - 0x6040 0000 0010 00d0 V31 128 fp_regs.vregs[31] (*) - 0x6020 0000 0010 00d4 FPSR 32 fp_regs.fpsr - 0x6020 0000 0010 00d5 FPCR 32 fp_regs.fpcr - -(*) These encodings are not accepted for SVE-enabled vcpus. See - KVM_ARM_VCPU_INIT. - - The equivalent register content can be accessed via bits [127:0] of - the corresponding SVE Zn registers instead for vcpus that have SVE - enabled (see below). - -arm64 CCSIDR registers are demultiplexed by CSSELR value: - 0x6020 0000 0011 00 - -arm64 system registers have the following id bit patterns: - 0x6030 0000 0013 - -arm64 firmware pseudo-registers have the following bit pattern: - 0x6030 0000 0014 - -arm64 SVE registers have the following bit patterns: - 0x6080 0000 0015 00 Zn bits[2048*slice + 2047 : 2048*slice] - 0x6050 0000 0015 04 Pn bits[256*slice + 255 : 256*slice] - 0x6050 0000 0015 060 FFR bits[256*slice + 255 : 256*slice] - 0x6060 0000 0015 ffff KVM_REG_ARM64_SVE_VLS pseudo-register - -Access to register IDs where 2048 * slice >= 128 * max_vq will fail with -ENOENT. max_vq is the vcpu's maximum supported vector length in 128-bit -quadwords: see (**) below. - -These registers are only accessible on vcpus for which SVE is enabled. -See KVM_ARM_VCPU_INIT for details. - -In addition, except for KVM_REG_ARM64_SVE_VLS, these registers are not -accessible until the vcpu's SVE configuration has been finalized -using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). See KVM_ARM_VCPU_INIT -and KVM_ARM_VCPU_FINALIZE for more information about this procedure. - -KVM_REG_ARM64_SVE_VLS is a pseudo-register that allows the set of vector -lengths supported by the vcpu to be discovered and configured by -userspace. When transferred to or from user memory via KVM_GET_ONE_REG -or KVM_SET_ONE_REG, the value of this register is of type -__u64[KVM_ARM64_SVE_VLS_WORDS], and encodes the set of vector lengths as -follows: - -__u64 vector_lengths[KVM_ARM64_SVE_VLS_WORDS]; - -if (vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX && - ((vector_lengths[(vq - KVM_ARM64_SVE_VQ_MIN) / 64] >> - ((vq - KVM_ARM64_SVE_VQ_MIN) % 64)) & 1)) - /* Vector length vq * 16 bytes supported */ -else - /* Vector length vq * 16 bytes not supported */ - -(**) The maximum value vq for which the above condition is true is -max_vq. This is the maximum vector length available to the guest on -this vcpu, and determines which register slices are visible through -this ioctl interface. - -(See Documentation/arm64/sve.rst for an explanation of the "vq" -nomenclature.) - -KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. -KVM_ARM_VCPU_INIT initialises it to the best set of vector lengths that -the host supports. - -Userspace may subsequently modify it if desired until the vcpu's SVE -configuration is finalized using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). - -Apart from simply removing all vector lengths from the host set that -exceed some value, support for arbitrarily chosen sets of vector lengths -is hardware-dependent and may not be available. Attempting to configure -an invalid set of vector lengths via KVM_SET_ONE_REG will fail with -EINVAL. - -After the vcpu's SVE configuration is finalized, further attempts to -write this register will fail with EPERM. - - -MIPS registers are mapped using the lower 32 bits. The upper 16 of that is -the register group type: - -MIPS core registers (see above) have the following id bit patterns: - 0x7030 0000 0000 - -MIPS CP0 registers (see KVM_REG_MIPS_CP0_* above) have the following id bit -patterns depending on whether they're 32-bit or 64-bit registers: - 0x7020 0000 0001 00 (32-bit) - 0x7030 0000 0001 00 (64-bit) - -Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 -versions of the EntryLo registers regardless of the word size of the host -hardware, host kernel, guest, and whether XPA is present in the guest, i.e. -with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and -the PFNX field starting at bit 30. - -MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit -patterns: - 0x7030 0000 0001 01 - -MIPS KVM control registers (see above) have the following id bit patterns: - 0x7030 0000 0002 - -MIPS FPU registers (see KVM_REG_MIPS_FPR_{32,64}() above) have the following -id bit patterns depending on the size of the register being accessed. They are -always accessed according to the current guest FPU mode (Status.FR and -Config5.FRE), i.e. as the guest would see them, and they become unpredictable -if the guest FPU mode is changed. MIPS SIMD Architecture (MSA) vector -registers (see KVM_REG_MIPS_VEC_128() above) have similar patterns as they -overlap the FPU registers: - 0x7020 0000 0003 00 <0:3> (32-bit FPU registers) - 0x7030 0000 0003 00 <0:3> (64-bit FPU registers) - 0x7040 0000 0003 00 <0:3> (128-bit MSA vector registers) - -MIPS FPU control registers (see KVM_REG_MIPS_FCR_{IR,CSR} above) have the -following id bit patterns: - 0x7020 0000 0003 01 <0:3> - -MIPS MSA control registers (see KVM_REG_MIPS_MSA_{IR,CSR} above) have the -following id bit patterns: - 0x7020 0000 0003 02 <0:3> - - -4.69 KVM_GET_ONE_REG - -Capability: KVM_CAP_ONE_REG -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_one_reg (in and out) -Returns: 0 on success, negative value on failure -Errors include: -  ENOENT:   no such register -  EINVAL:   invalid register ID, or no such register -  EPERM:    (arm64) register access not allowed before vcpu finalization -(These error codes are indicative only: do not rely on a specific error -code being returned in a specific situation.) - -This ioctl allows to receive the value of a single register implemented -in a vcpu. The register to read is indicated by the "id" field of the -kvm_one_reg struct passed in. On success, the register value can be found -at the memory location pointed to by "addr". - -The list of registers accessible using this interface is identical to the -list in 4.68. - - -4.70 KVM_KVMCLOCK_CTRL - -Capability: KVM_CAP_KVMCLOCK_CTRL -Architectures: Any that implement pvclocks (currently x86 only) -Type: vcpu ioctl -Parameters: None -Returns: 0 on success, -1 on error - -This signals to the host kernel that the specified guest is being paused by -userspace. The host will set a flag in the pvclock structure that is checked -from the soft lockup watchdog. The flag is part of the pvclock structure that -is shared between guest and host, specifically the second bit of the flags -field of the pvclock_vcpu_time_info structure. It will be set exclusively by -the host and read/cleared exclusively by the guest. The guest operation of -checking and clearing the flag must an atomic operation so -load-link/store-conditional, or equivalent must be used. There are two cases -where the guest will clear the flag: when the soft lockup watchdog timer resets -itself or when a soft lockup is detected. This ioctl can be called any time -after pausing the vcpu, but before it is resumed. - - -4.71 KVM_SIGNAL_MSI - -Capability: KVM_CAP_SIGNAL_MSI -Architectures: x86 arm arm64 -Type: vm ioctl -Parameters: struct kvm_msi (in) -Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error - -Directly inject a MSI message. Only valid with in-kernel irqchip that handles -MSI messages. - -struct kvm_msi { - __u32 address_lo; - __u32 address_hi; - __u32 data; - __u32 flags; - __u32 devid; - __u8 pad[12]; -}; - -flags: KVM_MSI_VALID_DEVID: devid contains a valid value. The per-VM - KVM_CAP_MSI_DEVID capability advertises the requirement to provide - the device ID. If this capability is not available, userspace - should never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. - -If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier -for the device that wrote the MSI message. For PCI, this is usually a -BFD identifier in the lower 16 bits. - -On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS -feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, -address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of -address_hi must be zero. - - -4.71 KVM_CREATE_PIT2 - -Capability: KVM_CAP_PIT2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_config (in) -Returns: 0 on success, -1 on error - -Creates an in-kernel device model for the i8254 PIT. This call is only valid -after enabling in-kernel irqchip support via KVM_CREATE_IRQCHIP. The following -parameters have to be passed: - -struct kvm_pit_config { - __u32 flags; - __u32 pad[15]; -}; - -Valid flags are: - -#define KVM_PIT_SPEAKER_DUMMY 1 /* emulate speaker port stub */ - -PIT timer interrupts may use a per-VM kernel thread for injection. If it -exists, this thread will have a name of the following pattern: - -kvm-pit/ - -When running a guest with elevated priorities, the scheduling parameters of -this thread may have to be adjusted accordingly. - -This IOCTL replaces the obsolete KVM_CREATE_PIT. - - -4.72 KVM_GET_PIT2 - -Capability: KVM_CAP_PIT_STATE2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_state2 (out) -Returns: 0 on success, -1 on error - -Retrieves the state of the in-kernel PIT model. Only valid after -KVM_CREATE_PIT2. The state is returned in the following structure: - -struct kvm_pit_state2 { - struct kvm_pit_channel_state channels[3]; - __u32 flags; - __u32 reserved[9]; -}; - -Valid flags are: - -/* disable PIT in HPET legacy mode */ -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 - -This IOCTL replaces the obsolete KVM_GET_PIT. - - -4.73 KVM_SET_PIT2 - -Capability: KVM_CAP_PIT_STATE2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_state2 (in) -Returns: 0 on success, -1 on error - -Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. -See KVM_GET_PIT2 for details on struct kvm_pit_state2. - -This IOCTL replaces the obsolete KVM_SET_PIT. - - -4.74 KVM_PPC_GET_SMMU_INFO - -Capability: KVM_CAP_PPC_GET_SMMU_INFO -Architectures: powerpc -Type: vm ioctl -Parameters: None -Returns: 0 on success, -1 on error - -This populates and returns a structure describing the features of -the "Server" class MMU emulation supported by KVM. -This can in turn be used by userspace to generate the appropriate -device-tree properties for the guest operating system. - -The structure contains some global information, followed by an -array of supported segment page sizes: - - struct kvm_ppc_smmu_info { - __u64 flags; - __u32 slb_size; - __u32 pad; - struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; - }; - -The supported flags are: - - - KVM_PPC_PAGE_SIZES_REAL: - When that flag is set, guest page sizes must "fit" the backing - store page sizes. When not set, any page size in the list can - be used regardless of how they are backed by userspace. - - - KVM_PPC_1T_SEGMENTS - The emulated MMU supports 1T segments in addition to the - standard 256M ones. - - - KVM_PPC_NO_HASH - This flag indicates that HPT guests are not supported by KVM, - thus all guests must use radix MMU mode. - -The "slb_size" field indicates how many SLB entries are supported - -The "sps" array contains 8 entries indicating the supported base -page sizes for a segment in increasing order. Each entry is defined -as follow: - - struct kvm_ppc_one_seg_page_size { - __u32 page_shift; /* Base page shift of segment (or 0) */ - __u32 slb_enc; /* SLB encoding for BookS */ - struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; - }; - -An entry with a "page_shift" of 0 is unused. Because the array is -organized in increasing order, a lookup can stop when encoutering -such an entry. - -The "slb_enc" field provides the encoding to use in the SLB for the -page size. The bits are in positions such as the value can directly -be OR'ed into the "vsid" argument of the slbmte instruction. - -The "enc" array is a list which for each of those segment base page -size provides the list of supported actual page sizes (which can be -only larger or equal to the base page size), along with the -corresponding encoding in the hash PTE. Similarly, the array is -8 entries sorted by increasing sizes and an entry with a "0" shift -is an empty entry and a terminator: - - struct kvm_ppc_one_page_size { - __u32 page_shift; /* Page shift (or 0) */ - __u32 pte_enc; /* Encoding in the HPTE (>>12) */ - }; - -The "pte_enc" field provides a value that can OR'ed into the hash -PTE's RPN field (ie, it needs to be shifted left by 12 to OR it -into the hash PTE second double word). - -4.75 KVM_IRQFD - -Capability: KVM_CAP_IRQFD -Architectures: x86 s390 arm arm64 -Type: vm ioctl -Parameters: struct kvm_irqfd (in) -Returns: 0 on success, -1 on error - -Allows setting an eventfd to directly trigger a guest interrupt. -kvm_irqfd.fd specifies the file descriptor to use as the eventfd and -kvm_irqfd.gsi specifies the irqchip pin toggled by this event. When -an event is triggered on the eventfd, an interrupt is injected into -the guest using the specified gsi pin. The irqfd is removed using -the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd -and kvm_irqfd.gsi. - -With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify -mechanism allowing emulation of level-triggered, irqfd-based -interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an -additional eventfd in the kvm_irqfd.resamplefd field. When operating -in resample mode, posting of an interrupt through kvm_irq.fd asserts -the specified gsi in the irqchip. When the irqchip is resampled, such -as from an EOI, the gsi is de-asserted and the user is notified via -kvm_irqfd.resamplefd. It is the user's responsibility to re-queue -the interrupt if the device making use of it still requires service. -Note that closing the resamplefd is not sufficient to disable the -irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment -and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. - -On arm/arm64, gsi routing being supported, the following can happen: -- in case no routing entry is associated to this gsi, injection fails -- in case the gsi is associated to an irqchip routing entry, - irqchip.pin + 32 corresponds to the injected SPI ID. -- in case the gsi is associated to an MSI routing entry, the MSI - message and device ID are translated into an LPI (support restricted - to GICv3 ITS in-kernel emulation). - -4.76 KVM_PPC_ALLOCATE_HTAB - -Capability: KVM_CAP_PPC_ALLOC_HTAB -Architectures: powerpc -Type: vm ioctl -Parameters: Pointer to u32 containing hash table order (in/out) -Returns: 0 on success, -1 on error - -This requests the host kernel to allocate an MMU hash table for a -guest using the PAPR paravirtualization interface. This only does -anything if the kernel is configured to use the Book 3S HV style of -virtualization. Otherwise the capability doesn't exist and the ioctl -returns an ENOTTY error. The rest of this description assumes Book 3S -HV. - -There must be no vcpus running when this ioctl is called; if there -are, it will do nothing and return an EBUSY error. - -The parameter is a pointer to a 32-bit unsigned integer variable -containing the order (log base 2) of the desired size of the hash -table, which must be between 18 and 46. On successful return from the -ioctl, the value will not be changed by the kernel. - -If no hash table has been allocated when any vcpu is asked to run -(with the KVM_RUN ioctl), the host kernel will allocate a -default-sized hash table (16 MB). - -If this ioctl is called when a hash table has already been allocated, -with a different order from the existing hash table, the existing hash -table will be freed and a new one allocated. If this is ioctl is -called when a hash table has already been allocated of the same order -as specified, the kernel will clear out the existing hash table (zero -all HPTEs). In either case, if the guest is using the virtualized -real-mode area (VRMA) facility, the kernel will re-create the VMRA -HPTEs on the next KVM_RUN of any vcpu. - -4.77 KVM_S390_INTERRUPT - -Capability: basic -Architectures: s390 -Type: vm ioctl, vcpu ioctl -Parameters: struct kvm_s390_interrupt (in) -Returns: 0 on success, -1 on error - -Allows to inject an interrupt to the guest. Interrupts can be floating -(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. - -Interrupt parameters are passed via kvm_s390_interrupt: - -struct kvm_s390_interrupt { - __u32 type; - __u32 parm; - __u64 parm64; -}; - -type can be one of the following: - -KVM_S390_SIGP_STOP (vcpu) - sigp stop; optional flags in parm -KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm -KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm -KVM_S390_RESTART (vcpu) - restart -KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt -KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt -KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt - parameters in parm and parm64 -KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm -KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm -KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm -KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) - compound value to indicate an - I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel); - I/O interruption parameters in parm (subchannel) and parm64 (intparm, - interruption subclass) -KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm, - machine check interrupt code in parm64 (note that - machine checks needing further payload are not - supported by this ioctl) - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - -4.78 KVM_PPC_GET_HTAB_FD - -Capability: KVM_CAP_PPC_HTAB_FD -Architectures: powerpc -Type: vm ioctl -Parameters: Pointer to struct kvm_get_htab_fd (in) -Returns: file descriptor number (>= 0) on success, -1 on error - -This returns a file descriptor that can be used either to read out the -entries in the guest's hashed page table (HPT), or to write entries to -initialize the HPT. The returned fd can only be written to if the -KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and -can only be read if that bit is clear. The argument struct looks like -this: - -/* For KVM_PPC_GET_HTAB_FD */ -struct kvm_get_htab_fd { - __u64 flags; - __u64 start_index; - __u64 reserved[2]; -}; - -/* Values for kvm_get_htab_fd.flags */ -#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) -#define KVM_GET_HTAB_WRITE ((__u64)0x2) - -The `start_index' field gives the index in the HPT of the entry at -which to start reading. It is ignored when writing. - -Reads on the fd will initially supply information about all -"interesting" HPT entries. Interesting entries are those with the -bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise -all entries. When the end of the HPT is reached, the read() will -return. If read() is called again on the fd, it will start again from -the beginning of the HPT, but will only return HPT entries that have -changed since they were last read. - -Data read or written is structured as a header (8 bytes) followed by a -series of valid HPT entries (16 bytes) each. The header indicates how -many valid HPT entries there are and how many invalid entries follow -the valid entries. The invalid entries are not represented explicitly -in the stream. The header format is: - -struct kvm_get_htab_header { - __u32 index; - __u16 n_valid; - __u16 n_invalid; -}; - -Writes to the fd create HPT entries starting at the index given in the -header; first `n_valid' valid entries with contents from the data -written, then `n_invalid' invalid entries, invalidating any previously -valid entries found. - -4.79 KVM_CREATE_DEVICE - -Capability: KVM_CAP_DEVICE_CTRL -Type: vm ioctl -Parameters: struct kvm_create_device (in/out) -Returns: 0 on success, -1 on error -Errors: - ENODEV: The device type is unknown or unsupported - EEXIST: Device already created, and this type of device may not - be instantiated multiple times - - Other error conditions may be defined by individual device types or - have their standard meanings. - -Creates an emulated device in the kernel. The file descriptor returned -in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR. - -If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the -device type is supported (not necessarily whether it can be created -in the current vm). - -Individual devices should not define flags. Attributes should be used -for specifying any behavior that is not implied by the device type -number. - -struct kvm_create_device { - __u32 type; /* in: KVM_DEV_TYPE_xxx */ - __u32 fd; /* out: device handle */ - __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ -}; - -4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR - -Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, - KVM_CAP_VCPU_ATTRIBUTES for vcpu device -Type: device ioctl, vm ioctl, vcpu ioctl -Parameters: struct kvm_device_attr -Returns: 0 on success, -1 on error -Errors: - ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - EPERM: The attribute cannot (currently) be accessed this way - (e.g. read-only attribute, or attribute that only makes - sense when the device is in a different state) - - Other error conditions may be defined by individual device types. - -Gets/sets a specified piece of device configuration and/or state. The -semantics are device-specific. See individual device documentation in -the "devices" directory. As with ONE_REG, the size of the data -transferred is defined by the particular attribute. - -struct kvm_device_attr { - __u32 flags; /* no flags currently defined */ - __u32 group; /* device-defined */ - __u64 attr; /* group-defined */ - __u64 addr; /* userspace address of attr data */ -}; - -4.81 KVM_HAS_DEVICE_ATTR - -Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, - KVM_CAP_VCPU_ATTRIBUTES for vcpu device -Type: device ioctl, vm ioctl, vcpu ioctl -Parameters: struct kvm_device_attr -Returns: 0 on success, -1 on error -Errors: - ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -Tests whether a device supports a particular attribute. A successful -return indicates the attribute is implemented. It does not necessarily -indicate that the attribute can be read or written in the device's -current state. "addr" is ignored. - -4.82 KVM_ARM_VCPU_INIT - -Capability: basic -Architectures: arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_init (in) -Returns: 0 on success; -1 on error -Errors: -  EINVAL:    the target is unknown, or the combination of features is invalid. -  ENOENT:    a features bit specified is unknown. - -This tells KVM what type of CPU to present to the guest, and what -optional features it should have.  This will cause a reset of the cpu -registers to their initial values.  If this is not called, KVM_RUN will -return ENOEXEC for that vcpu. - -Note that because some registers reflect machine topology, all vcpus -should be created before this ioctl is invoked. - -Userspace can call this function multiple times for a given vcpu, including -after the vcpu has been run. This will reset the vcpu to its initial -state. All calls to this function after the initial call must use the same -target and same set of feature flags, otherwise EINVAL will be returned. - -Possible features: - - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state. - Depends on KVM_CAP_ARM_PSCI. If not set, the CPU will be powered on - and execute guest code when KVM_RUN is called. - - KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode. - Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). - - KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 (or a future revision - backward compatible with v0.2) for the CPU. - Depends on KVM_CAP_ARM_PSCI_0_2. - - KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU. - Depends on KVM_CAP_ARM_PMU_V3. - - - KVM_ARM_VCPU_PTRAUTH_ADDRESS: Enables Address Pointer authentication - for arm64 only. - Depends on KVM_CAP_ARM_PTRAUTH_ADDRESS. - If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are - both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and - KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be - requested. - - - KVM_ARM_VCPU_PTRAUTH_GENERIC: Enables Generic Pointer authentication - for arm64 only. - Depends on KVM_CAP_ARM_PTRAUTH_GENERIC. - If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are - both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and - KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be - requested. - - - KVM_ARM_VCPU_SVE: Enables SVE for the CPU (arm64 only). - Depends on KVM_CAP_ARM_SVE. - Requires KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - * After KVM_ARM_VCPU_INIT: - - - KVM_REG_ARM64_SVE_VLS may be read using KVM_GET_ONE_REG: the - initial value of this pseudo-register indicates the best set of - vector lengths possible for a vcpu on this host. - - * Before KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - - KVM_RUN and KVM_GET_REG_LIST are not available; - - - KVM_GET_ONE_REG and KVM_SET_ONE_REG cannot be used to access - the scalable archietctural SVE registers - KVM_REG_ARM64_SVE_ZREG(), KVM_REG_ARM64_SVE_PREG() or - KVM_REG_ARM64_SVE_FFR; - - - KVM_REG_ARM64_SVE_VLS may optionally be written using - KVM_SET_ONE_REG, to modify the set of vector lengths available - for the vcpu. - - * After KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can - no longer be written using KVM_SET_ONE_REG. - -4.83 KVM_ARM_PREFERRED_TARGET - -Capability: basic -Architectures: arm, arm64 -Type: vm ioctl -Parameters: struct struct kvm_vcpu_init (out) -Returns: 0 on success; -1 on error -Errors: - ENODEV: no preferred target available for the host - -This queries KVM for preferred CPU target type which can be emulated -by KVM on underlying host. - -The ioctl returns struct kvm_vcpu_init instance containing information -about preferred CPU target type and recommended features for it. The -kvm_vcpu_init->features bitmap returned will have feature bits set if -the preferred target recommends setting these features, but this is -not mandatory. - -The information returned by this ioctl can be used to prepare an instance -of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in -in VCPU matching underlying host. - - -4.84 KVM_GET_REG_LIST - -Capability: basic -Architectures: arm, arm64, mips -Type: vcpu ioctl -Parameters: struct kvm_reg_list (in/out) -Returns: 0 on success; -1 on error -Errors: -  E2BIG:     the reg index list is too big to fit in the array specified by -             the user (the number required will be written into n). - -struct kvm_reg_list { - __u64 n; /* number of registers in reg[] */ - __u64 reg[0]; -}; - -This ioctl returns the guest registers that are supported for the -KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. - - -4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) - -Capability: KVM_CAP_ARM_SET_DEVICE_ADDR -Architectures: arm, arm64 -Type: vm ioctl -Parameters: struct kvm_arm_device_address (in) -Returns: 0 on success, -1 on error -Errors: - ENODEV: The device id is unknown - ENXIO: Device not supported on current system - EEXIST: Address already set - E2BIG: Address outside guest physical address space - EBUSY: Address overlaps with other device range - -struct kvm_arm_device_addr { - __u64 id; - __u64 addr; -}; - -Specify a device address in the guest's physical address space where guests -can access emulated or directly exposed devices, which the host kernel needs -to know about. The id field is an architecture specific identifier for a -specific device. - -ARM/arm64 divides the id field into two parts, a device id and an -address type id specific to the individual device. - -  bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 | - field: | 0x00000000 | device id | addr type id | - -ARM/arm64 currently only require this when using the in-kernel GIC -support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 -as the device id. When setting the base address for the guest's -mapping of the VGIC virtual CPU and distributor interface, the ioctl -must be called after calling KVM_CREATE_IRQCHIP, but before calling -KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the -base addresses will return -EEXIST. - -Note, this IOCTL is deprecated and the more flexible SET/GET_DEVICE_ATTR API -should be used instead. - - -4.86 KVM_PPC_RTAS_DEFINE_TOKEN - -Capability: KVM_CAP_PPC_RTAS -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_rtas_token_args -Returns: 0 on success, -1 on error - -Defines a token value for a RTAS (Run Time Abstraction Services) -service in order to allow it to be handled in the kernel. The -argument struct gives the name of the service, which must be the name -of a service that has a kernel-side implementation. If the token -value is non-zero, it will be associated with that service, and -subsequent RTAS calls by the guest specifying that token will be -handled by the kernel. If the token value is 0, then any token -associated with the service will be forgotten, and subsequent RTAS -calls by the guest for that service will be passed to userspace to be -handled. - -4.87 KVM_SET_GUEST_DEBUG - -Capability: KVM_CAP_SET_GUEST_DEBUG -Architectures: x86, s390, ppc, arm64 -Type: vcpu ioctl -Parameters: struct kvm_guest_debug (in) -Returns: 0 on success; -1 on error - -struct kvm_guest_debug { - __u32 control; - __u32 pad; - struct kvm_guest_debug_arch arch; -}; - -Set up the processor specific debug registers and configure vcpu for -handling guest debug events. There are two parts to the structure, the -first a control bitfield indicates the type of debug events to handle -when running. Common control bits are: - - - KVM_GUESTDBG_ENABLE: guest debugging is enabled - - KVM_GUESTDBG_SINGLESTEP: the next run should single-step - -The top 16 bits of the control field are architecture specific control -flags which can include the following: - - - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64] - - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64] - - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] - - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] - -For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints -are enabled in memory so we need to ensure breakpoint exceptions are -correctly trapped and the KVM run loop exits at the breakpoint and not -running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP -we need to ensure the guest vCPUs architecture specific registers are -updated to the correct (supplied) values. - -The second part of the structure is architecture specific and -typically contains a set of debug registers. - -For arm64 the number of debug registers is implementation defined and -can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and -KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number -indicating the number of supported registers. - -When debug events exit the main run loop with the reason -KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run -structure containing architecture specific debug information. - -4.88 KVM_GET_EMULATED_CPUID - -Capability: KVM_CAP_EXT_EMUL_CPUID -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 flags; - struct kvm_cpuid_entry2 entries[0]; -}; - -The member 'flags' is used for passing flags from userspace. - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features which are emulated by -kvm.Userspace can use the information returned by this ioctl to query -which features are emulated by kvm instead of being present natively. - -Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 -structure with the 'nent' field indicating the number of entries in -the variable-size array 'entries'. If the number of entries is too low -to describe the cpu capabilities, an error (E2BIG) is returned. If the -number is too high, the 'nent' field is adjusted and an error (ENOMEM) -is returned. If the number is just right, the 'nent' field is adjusted -to the number of valid entries in the 'entries' array, which is then -filled. - -The entries returned are the set CPUID bits of the respective features -which kvm emulates, as returned by the CPUID instruction, with unknown -or unsupported feature bits cleared. - -Features like x2apic, for example, may not be present in the host cpu -but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be -emulated efficiently and thus not included here. - -The fields in each entry are defined as follows: - - function: the eax value used to obtain the entry - index: the ecx value used to obtain the entry (for entries that are - affected by ecx) - flags: an OR of zero or more of the following: - KVM_CPUID_FLAG_SIGNIFCANT_INDEX: - if the index field is valid - KVM_CPUID_FLAG_STATEFUL_FUNC: - if cpuid for this function returns different values for successive - invocations; there will be several entries with the same function, - all with this flag set - KVM_CPUID_FLAG_STATE_READ_NEXT: - for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is - the first entry to be read by a cpu - eax, ebx, ecx, edx: the values returned by the cpuid instruction for - this function/index combination - -4.89 KVM_S390_MEM_OP - -Capability: KVM_CAP_S390_MEM_OP -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_mem_op (in) -Returns: = 0 on success, - < 0 on generic error (e.g. -EFAULT or -ENOMEM), - > 0 if an exception occurred while walking the page tables - -Read or write data from/to the logical (virtual) memory of a VCPU. - -Parameters are specified via the following structure: - -struct kvm_s390_mem_op { - __u64 gaddr; /* the guest address */ - __u64 flags; /* flags */ - __u32 size; /* amount of bytes */ - __u32 op; /* type of operation */ - __u64 buf; /* buffer in userspace */ - __u8 ar; /* the access register number */ - __u8 reserved[31]; /* should be set to 0 */ -}; - -The type of operation is specified in the "op" field. It is either -KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or -KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. The -KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check -whether the corresponding memory access would create an access exception -(without touching the data in the memory at the destination). In case an -access exception occurred while walking the MMU tables of the guest, the -ioctl returns a positive error number to indicate the type of exception. -This exception is also raised directly at the corresponding VCPU if the -flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. - -The start address of the memory region has to be specified in the "gaddr" -field, and the length of the region in the "size" field. "buf" is the buffer -supplied by the userspace application where the read data should be written -to for KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written -is stored for a KVM_S390_MEMOP_LOGICAL_WRITE. "buf" is unused and can be NULL -when KVM_S390_MEMOP_F_CHECK_ONLY is specified. "ar" designates the access -register number to be used. - -The "reserved" field is meant for future extensions. It is not used by -KVM with the currently defined set of flags. - -4.90 KVM_S390_GET_SKEYS - -Capability: KVM_CAP_S390_SKEYS -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_skeys -Returns: 0 on success, KVM_S390_GET_KEYS_NONE if guest is not using storage - keys, negative value on error - -This ioctl is used to get guest storage key values on the s390 -architecture. The ioctl takes parameters via the kvm_s390_skeys struct. - -struct kvm_s390_skeys { - __u64 start_gfn; - __u64 count; - __u64 skeydata_addr; - __u32 flags; - __u32 reserved[9]; -}; - -The start_gfn field is the number of the first guest frame whose storage keys -you want to get. - -The count field is the number of consecutive frames (starting from start_gfn) -whose storage keys to get. The count field must be at least 1 and the maximum -allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range -will cause the ioctl to return -EINVAL. - -The skeydata_addr field is the address to a buffer large enough to hold count -bytes. This buffer will be filled with storage key data by the ioctl. - -4.91 KVM_S390_SET_SKEYS - -Capability: KVM_CAP_S390_SKEYS -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_skeys -Returns: 0 on success, negative value on error - -This ioctl is used to set guest storage key values on the s390 -architecture. The ioctl takes parameters via the kvm_s390_skeys struct. -See section on KVM_S390_GET_SKEYS for struct definition. - -The start_gfn field is the number of the first guest frame whose storage keys -you want to set. - -The count field is the number of consecutive frames (starting from start_gfn) -whose storage keys to get. The count field must be at least 1 and the maximum -allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range -will cause the ioctl to return -EINVAL. - -The skeydata_addr field is the address to a buffer containing count bytes of -storage keys. Each byte in the buffer will be set as the storage key for a -single frame starting at start_gfn for count frames. - -Note: If any architecturally invalid key value is found in the given data then -the ioctl will return -EINVAL. - -4.92 KVM_S390_IRQ - -Capability: KVM_CAP_S390_INJECT_IRQ -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq (in) -Returns: 0 on success, -1 on error -Errors: - EINVAL: interrupt type is invalid - type is KVM_S390_SIGP_STOP and flag parameter is invalid value - type is KVM_S390_INT_EXTERNAL_CALL and code is bigger - than the maximum of VCPUs - EBUSY: type is KVM_S390_SIGP_SET_PREFIX and vcpu is not stopped - type is KVM_S390_SIGP_STOP and a stop irq is already pending - type is KVM_S390_INT_EXTERNAL_CALL and an external call interrupt - is already pending - -Allows to inject an interrupt to the guest. - -Using struct kvm_s390_irq as a parameter allows -to inject additional payload which is not -possible via KVM_S390_INTERRUPT. - -Interrupt parameters are passed via kvm_s390_irq: - -struct kvm_s390_irq { - __u64 type; - union { - struct kvm_s390_io_info io; - struct kvm_s390_ext_info ext; - struct kvm_s390_pgm_info pgm; - struct kvm_s390_emerg_info emerg; - struct kvm_s390_extcall_info extcall; - struct kvm_s390_prefix_info prefix; - struct kvm_s390_stop_info stop; - struct kvm_s390_mchk_info mchk; - char reserved[64]; - } u; -}; - -type can be one of the following: - -KVM_S390_SIGP_STOP - sigp stop; parameter in .stop -KVM_S390_PROGRAM_INT - program check; parameters in .pgm -KVM_S390_SIGP_SET_PREFIX - sigp set prefix; parameters in .prefix -KVM_S390_RESTART - restart; no parameters -KVM_S390_INT_CLOCK_COMP - clock comparator interrupt; no parameters -KVM_S390_INT_CPU_TIMER - CPU timer interrupt; no parameters -KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg -KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall -KVM_S390_MCHK - machine check interrupt; parameters in .mchk - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - -4.94 KVM_S390_GET_IRQ_STATE - -Capability: KVM_CAP_S390_IRQ_STATE -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq_state (out) -Returns: >= number of bytes copied into buffer, - -EINVAL if buffer size is 0, - -ENOBUFS if buffer size is too small to fit all pending interrupts, - -EFAULT if the buffer address was invalid - -This ioctl allows userspace to retrieve the complete state of all currently -pending interrupts in a single buffer. Use cases include migration -and introspection. The parameter structure contains the address of a -userspace buffer and its length: - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - -Userspace passes in the above struct and for each pending interrupt a -struct kvm_s390_irq is copied to the provided buffer. - -The structure contains a flags and a reserved field for future extensions. As -the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and -reserved, these fields can not be used in the future without breaking -compatibility. - -If -ENOBUFS is returned the buffer provided was too small and userspace -may retry with a bigger buffer. - -4.95 KVM_S390_SET_IRQ_STATE - -Capability: KVM_CAP_S390_IRQ_STATE -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq_state (in) -Returns: 0 on success, - -EFAULT if the buffer address was invalid, - -EINVAL for an invalid buffer length (see below), - -EBUSY if there were already interrupts pending, - errors occurring when actually injecting the - interrupt. See KVM_S390_IRQ. - -This ioctl allows userspace to set the complete state of all cpu-local -interrupts currently pending for the vcpu. It is intended for restoring -interrupt state after a migration. The input parameter is a userspace buffer -containing a struct kvm_s390_irq_state: - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - -The restrictions for flags and reserved apply as well. -(see KVM_S390_GET_IRQ_STATE) - -The userspace memory referenced by buf contains a struct kvm_s390_irq -for each interrupt to be injected into the guest. -If one of the interrupts could not be injected for some reason the -ioctl aborts. - -len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0 -and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), -which is the maximum number of possibly pending cpu-local interrupts. - -4.96 KVM_SMI - -Capability: KVM_CAP_X86_SMM -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Queues an SMI on the thread's vcpu. - -4.97 KVM_CAP_PPC_MULTITCE - -Capability: KVM_CAP_PPC_MULTITCE -Architectures: ppc -Type: vm - -This capability means the kernel is capable of handling hypercalls -H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user -space. This significantly accelerates DMA operations for PPC KVM guests. -User space should expect that its handlers for these hypercalls -are not going to be called if user space previously registered LIOBN -in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). - -In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, -user space might have to advertise it for the guest. For example, -IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is -present in the "ibm,hypertas-functions" device-tree property. - -The hypercalls mentioned above may or may not be processed successfully -in the kernel based fast path. If they can not be handled by the kernel, -they will get passed on to user space. So user space still has to have -an implementation for these despite the in kernel acceleration. - -This capability is always enabled. - -4.98 KVM_CREATE_SPAPR_TCE_64 - -Capability: KVM_CAP_SPAPR_TCE_64 -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_create_spapr_tce_64 (in) -Returns: file descriptor for manipulating the created TCE table - -This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit -windows, described in 4.62 KVM_CREATE_SPAPR_TCE - -This capability uses extended struct in ioctl interface: - -/* for KVM_CAP_SPAPR_TCE_64 */ -struct kvm_create_spapr_tce_64 { - __u64 liobn; - __u32 page_shift; - __u32 flags; - __u64 offset; /* in pages */ - __u64 size; /* in pages */ -}; - -The aim of extension is to support an additional bigger DMA window with -a variable page size. -KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and -a bus offset of the corresponding DMA window, @size and @offset are numbers -of IOMMU pages. - -@flags are not used at the moment. - -The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. - -4.99 KVM_REINJECT_CONTROL - -Capability: KVM_CAP_REINJECT_CONTROL -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_reinject_control (in) -Returns: 0 on success, - -EFAULT if struct kvm_reinject_control cannot be read, - -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier. - -i8254 (PIT) has two modes, reinject and !reinject. The default is reinject, -where KVM queues elapsed i8254 ticks and monitors completion of interrupt from -vector(s) that i8254 injects. Reinject mode dequeues a tick and injects its -interrupt whenever there isn't a pending interrupt from i8254. -!reinject mode injects an interrupt as soon as a tick arrives. - -struct kvm_reinject_control { - __u8 pit_reinject; - __u8 reserved[31]; -}; - -pit_reinject = 0 (!reinject mode) is recommended, unless running an old -operating system that uses the PIT for timing (e.g. Linux 2.4.x). - -4.100 KVM_PPC_CONFIGURE_V3_MMU - -Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_mmuv3_cfg (in) -Returns: 0 on success, - -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, - -EINVAL if the configuration is invalid - -This ioctl controls whether the guest will use radix or HPT (hashed -page table) translation, and sets the pointer to the process table for -the guest. - -struct kvm_ppc_mmuv3_cfg { - __u64 flags; - __u64 process_table; -}; - -There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and -KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest -to use radix tree translation, and if clear, to use HPT translation. -KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest -to be able to use the global TLB and SLB invalidation instructions; -if clear, the guest may not use these instructions. - -The process_table field specifies the address and size of the guest -process table, which is in the guest's space. This field is formatted -as the second doubleword of the partition table entry, as defined in -the Power ISA V3.00, Book III section 5.7.6.1. - -4.101 KVM_PPC_GET_RMMU_INFO - -Capability: KVM_CAP_PPC_RADIX_MMU -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_rmmu_info (out) -Returns: 0 on success, - -EFAULT if struct kvm_ppc_rmmu_info cannot be written, - -EINVAL if no useful information can be returned - -This ioctl returns a structure containing two things: (a) a list -containing supported radix tree geometries, and (b) a list that maps -page sizes to put in the "AP" (actual page size) field for the tlbie -(TLB invalidate entry) instruction. - -struct kvm_ppc_rmmu_info { - struct kvm_ppc_radix_geom { - __u8 page_shift; - __u8 level_bits[4]; - __u8 pad[3]; - } geometries[8]; - __u32 ap_encodings[8]; -}; - -The geometries[] field gives up to 8 supported geometries for the -radix page table, in terms of the log base 2 of the smallest page -size, and the number of bits indexed at each level of the tree, from -the PTE level up to the PGD level in that order. Any unused entries -will have 0 in the page_shift field. - -The ap_encodings gives the supported page sizes and their AP field -encodings, encoded with the AP value in the top 3 bits and the log -base 2 of the page size in the bottom 6 bits. - -4.102 KVM_PPC_RESIZE_HPT_PREPARE - -Capability: KVM_CAP_SPAPR_RESIZE_HPT -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_resize_hpt (in) -Returns: 0 on successful completion, - >0 if a new HPT is being prepared, the value is an estimated - number of milliseconds until preparation is complete - -EFAULT if struct kvm_reinject_control cannot be read, - -EINVAL if the supplied shift or flags are invalid - -ENOMEM if unable to allocate the new HPT - -ENOSPC if there was a hash collision when moving existing - HPT entries to the new HPT - -EIO on other error conditions - -Used to implement the PAPR extension for runtime resizing of a guest's -Hashed Page Table (HPT). Specifically this starts, stops or monitors -the preparation of a new potential HPT for the guest, essentially -implementing the H_RESIZE_HPT_PREPARE hypercall. - -If called with shift > 0 when there is no pending HPT for the guest, -this begins preparation of a new pending HPT of size 2^(shift) bytes. -It then returns a positive integer with the estimated number of -milliseconds until preparation is complete. - -If called when there is a pending HPT whose size does not match that -requested in the parameters, discards the existing pending HPT and -creates a new one as above. - -If called when there is a pending HPT of the size requested, will: - * If preparation of the pending HPT is already complete, return 0 - * If preparation of the pending HPT has failed, return an error - code, then discard the pending HPT. - * If preparation of the pending HPT is still in progress, return an - estimated number of milliseconds until preparation is complete. - -If called with shift == 0, discards any currently pending HPT and -returns 0 (i.e. cancels any in-progress preparation). - -flags is reserved for future expansion, currently setting any bits in -flags will result in an -EINVAL. - -Normally this will be called repeatedly with the same parameters until -it returns <= 0. The first call will initiate preparation, subsequent -ones will monitor preparation until it completes or fails. - -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - -4.103 KVM_PPC_RESIZE_HPT_COMMIT - -Capability: KVM_CAP_SPAPR_RESIZE_HPT -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_resize_hpt (in) -Returns: 0 on successful completion, - -EFAULT if struct kvm_reinject_control cannot be read, - -EINVAL if the supplied shift or flags are invalid - -ENXIO is there is no pending HPT, or the pending HPT doesn't - have the requested size - -EBUSY if the pending HPT is not fully prepared - -ENOSPC if there was a hash collision when moving existing - HPT entries to the new HPT - -EIO on other error conditions - -Used to implement the PAPR extension for runtime resizing of a guest's -Hashed Page Table (HPT). Specifically this requests that the guest be -transferred to working with the new HPT, essentially implementing the -H_RESIZE_HPT_COMMIT hypercall. - -This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has -returned 0 with the same parameters. In other cases -KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or --EBUSY, though others may be possible if the preparation was started, -but failed). - -This will have undefined effects on the guest if it has not already -placed itself in a quiescent state where no vcpu will make MMU enabled -memory accesses. - -On succsful completion, the pending HPT will become the guest's active -HPT and the previous HPT will be discarded. - -On failure, the guest will still be operating on its previous HPT. - -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - -4.104 KVM_X86_GET_MCE_CAP_SUPPORTED - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: system ioctl -Parameters: u64 mce_cap (out) -Returns: 0 on success, -1 on error - -Returns supported MCE capabilities. The u64 mce_cap parameter -has the same format as the MSR_IA32_MCG_CAP register. Supported -capabilities will have the corresponding bits set. - -4.105 KVM_X86_SETUP_MCE - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: vcpu ioctl -Parameters: u64 mcg_cap (in) -Returns: 0 on success, - -EFAULT if u64 mcg_cap cannot be read, - -EINVAL if the requested number of banks is invalid, - -EINVAL if requested MCE capability is not supported. - -Initializes MCE support for use. The u64 mcg_cap parameter -has the same format as the MSR_IA32_MCG_CAP register and -specifies which capabilities should be enabled. The maximum -supported number of error-reporting banks can be retrieved when -checking for KVM_CAP_MCE. The supported capabilities can be -retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED. - -4.106 KVM_X86_SET_MCE - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_x86_mce (in) -Returns: 0 on success, - -EFAULT if struct kvm_x86_mce cannot be read, - -EINVAL if the bank number is invalid, - -EINVAL if VAL bit is not set in status field. - -Inject a machine check error (MCE) into the guest. The input -parameter is: - -struct kvm_x86_mce { - __u64 status; - __u64 addr; - __u64 misc; - __u64 mcg_status; - __u8 bank; - __u8 pad1[7]; - __u64 pad2[3]; -}; - -If the MCE being reported is an uncorrected error, KVM will -inject it as an MCE exception into the guest. If the guest -MCG_STATUS register reports that an MCE is in progress, KVM -causes an KVM_EXIT_SHUTDOWN vmexit. - -Otherwise, if the MCE is a corrected error, KVM will just -store it in the corresponding bank (provided this bank is -not holding a previously reported uncorrected error). - -4.107 KVM_S390_GET_CMMA_BITS - -Capability: KVM_CAP_S390_CMMA_MIGRATION -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_cmma_log (in, out) -Returns: 0 on success, a negative value on error - -This ioctl is used to get the values of the CMMA bits on the s390 -architecture. It is meant to be used in two scenarios: -- During live migration to save the CMMA values. Live migration needs - to be enabled via the KVM_REQ_START_MIGRATION VM property. -- To non-destructively peek at the CMMA values, with the flag - KVM_S390_CMMA_PEEK set. - -The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired -values are written to a buffer whose location is indicated via the "values" -member in the kvm_s390_cmma_log struct. The values in the input struct are -also updated as needed. -Each CMMA value takes up one byte. - -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - -start_gfn is the number of the first guest frame whose CMMA values are -to be retrieved, - -count is the length of the buffer in bytes, - -values points to the buffer where the result will be written to. - -If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be -KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with -other ioctls. - -The result is written in the buffer pointed to by the field values, and -the values of the input parameter are updated as follows. - -Depending on the flags, different actions are performed. The only -supported flag so far is KVM_S390_CMMA_PEEK. - -The default behaviour if KVM_S390_CMMA_PEEK is not set is: -start_gfn will indicate the first page frame whose CMMA bits were dirty. -It is not necessarily the same as the one passed as input, as clean pages -are skipped. - -count will indicate the number of bytes actually written in the buffer. -It can (and very often will) be smaller than the input value, since the -buffer is only filled until 16 bytes of clean values are found (which -are then not copied in the buffer). Since a CMMA migration block needs -the base address and the length, for a total of 16 bytes, we will send -back some clean data if there is some dirty data afterwards, as long as -the size of the clean data does not exceed the size of the header. This -allows to minimize the amount of data to be saved or transferred over -the network at the expense of more roundtrips to userspace. The next -invocation of the ioctl will skip over all the clean values, saving -potentially more than just the 16 bytes we found. - -If KVM_S390_CMMA_PEEK is set: -the existing storage attributes are read even when not in migration -mode, and no other action is performed; - -the output start_gfn will be equal to the input start_gfn, - -the output count will be equal to the input count, except if the end of -memory has been reached. - -In both cases: -the field "remaining" will indicate the total number of dirty CMMA values -still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is -not enabled. - -mask is unused. - -values points to the userspace buffer where the result will be stored. - -This ioctl can fail with -ENOMEM if not enough memory can be allocated to -complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if -KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with --EFAULT if the userspace address is invalid or if no page table is -present for the addresses (e.g. when using hugepages). - -4.108 KVM_S390_SET_CMMA_BITS - -Capability: KVM_CAP_S390_CMMA_MIGRATION -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_cmma_log (in) -Returns: 0 on success, a negative value on error - -This ioctl is used to set the values of the CMMA bits on the s390 -architecture. It is meant to be used during live migration to restore -the CMMA values, but there are no restrictions on its use. -The ioctl takes parameters via the kvm_s390_cmma_values struct. -Each CMMA value takes up one byte. - -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - -start_gfn indicates the starting guest frame number, - -count indicates how many values are to be considered in the buffer, - -flags is not used and must be 0. - -mask indicates which PGSTE bits are to be considered. - -remaining is not used. - -values points to the buffer in userspace where to store the values. - -This ioctl can fail with -ENOMEM if not enough memory can be allocated to -complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if -the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or -if the flags field was not 0, with -EFAULT if the userspace address is -invalid, if invalid pages are written to (e.g. after the end of memory) -or if no page table is present for the addresses (e.g. when using -hugepages). - -4.109 KVM_PPC_GET_CPU_CHAR - -Capability: KVM_CAP_PPC_GET_CPU_CHAR -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_cpu_char (out) -Returns: 0 on successful completion - -EFAULT if struct kvm_ppc_cpu_char cannot be written - -This ioctl gives userspace information about certain characteristics -of the CPU relating to speculative execution of instructions and -possible information leakage resulting from speculative execution (see -CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754). The information is -returned in struct kvm_ppc_cpu_char, which looks like this: - -struct kvm_ppc_cpu_char { - __u64 character; /* characteristics of the CPU */ - __u64 behaviour; /* recommended software behaviour */ - __u64 character_mask; /* valid bits in character */ - __u64 behaviour_mask; /* valid bits in behaviour */ -}; - -For extensibility, the character_mask and behaviour_mask fields -indicate which bits of character and behaviour have been filled in by -the kernel. If the set of defined bits is extended in future then -userspace will be able to tell whether it is running on a kernel that -knows about the new bits. - -The character field describes attributes of the CPU which can help -with preventing inadvertent information disclosure - specifically, -whether there is an instruction to flash-invalidate the L1 data cache -(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set -to a mode where entries can only be used by the thread that created -them, whether the bcctr[l] instruction prevents speculation, and -whether a speculation barrier instruction (ori 31,31,0) is provided. - -The behaviour field describes actions that software should take to -prevent inadvertent information disclosure, and thus describes which -vulnerabilities the hardware is subject to; specifically whether the -L1 data cache should be flushed when returning to user mode from the -kernel, and whether a speculation barrier should be placed between an -array bounds check and the array access. - -These fields use the same bit definitions as the new -H_GET_CPU_CHARACTERISTICS hypercall. - -4.110 KVM_MEMORY_ENCRYPT_OP - -Capability: basic -Architectures: x86 -Type: system -Parameters: an opaque platform specific structure (in/out) -Returns: 0 on success; -1 on error - -If the platform supports creating encrypted VMs then this ioctl can be used -for issuing platform-specific memory encryption commands to manage those -encrypted VMs. - -Currently, this ioctl is used for issuing Secure Encrypted Virtualization -(SEV) commands on AMD Processors. The SEV commands are defined in -Documentation/virtual/kvm/amd-memory-encryption.rst. - -4.111 KVM_MEMORY_ENCRYPT_REG_REGION - -Capability: basic -Architectures: x86 -Type: system -Parameters: struct kvm_enc_region (in) -Returns: 0 on success; -1 on error - -This ioctl can be used to register a guest memory region which may -contain encrypted data (e.g. guest RAM, SMRAM etc). - -It is used in the SEV-enabled guest. When encryption is enabled, a guest -memory region may contain encrypted data. The SEV memory encryption -engine uses a tweak such that two identical plaintext pages, each at -different locations will have differing ciphertexts. So swapping or -moving ciphertext of those pages will not result in plaintext being -swapped. So relocating (or migrating) physical backing pages for the SEV -guest will require some additional steps. - -Note: The current SEV key management spec does not provide commands to -swap or migrate (move) ciphertext pages. Hence, for now we pin the guest -memory region registered with the ioctl. - -4.112 KVM_MEMORY_ENCRYPT_UNREG_REGION - -Capability: basic -Architectures: x86 -Type: system -Parameters: struct kvm_enc_region (in) -Returns: 0 on success; -1 on error - -This ioctl can be used to unregister the guest memory region registered -with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above. - -4.113 KVM_HYPERV_EVENTFD - -Capability: KVM_CAP_HYPERV_EVENTFD -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_hyperv_eventfd (in) - -This ioctl (un)registers an eventfd to receive notifications from the guest on -the specified Hyper-V connection id through the SIGNAL_EVENT hypercall, without -causing a user exit. SIGNAL_EVENT hypercall with non-zero event flag number -(bits 24-31) still triggers a KVM_EXIT_HYPERV_HCALL user exit. - -struct kvm_hyperv_eventfd { - __u32 conn_id; - __s32 fd; - __u32 flags; - __u32 padding[3]; -}; - -The conn_id field should fit within 24 bits: - -#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff - -The acceptable values for the flags field are: - -#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) - -Returns: 0 on success, - -EINVAL if conn_id or flags is outside the allowed range - -ENOENT on deassign if the conn_id isn't registered - -EEXIST on assign if the conn_id is already registered - -4.114 KVM_GET_NESTED_STATE - -Capability: KVM_CAP_NESTED_STATE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_nested_state (in/out) -Returns: 0 on success, -1 on error -Errors: - E2BIG: the total state size exceeds the value of 'size' specified by - the user; the size required will be written into size. - -struct kvm_nested_state { - __u16 flags; - __u16 format; - __u32 size; - - union { - struct kvm_vmx_nested_state_hdr vmx; - struct kvm_svm_nested_state_hdr svm; - - /* Pad the header to 128 bytes. */ - __u8 pad[120]; - } hdr; - - union { - struct kvm_vmx_nested_state_data vmx[0]; - struct kvm_svm_nested_state_data svm[0]; - } data; -}; - -#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 -#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 -#define KVM_STATE_NESTED_EVMCS 0x00000004 - -#define KVM_STATE_NESTED_FORMAT_VMX 0 -#define KVM_STATE_NESTED_FORMAT_SVM 1 - -#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 - -#define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001 -#define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002 - -struct kvm_vmx_nested_state_hdr { - __u64 vmxon_pa; - __u64 vmcs12_pa; - - struct { - __u16 flags; - } smm; -}; - -struct kvm_vmx_nested_state_data { - __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; - __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; -}; - -This ioctl copies the vcpu's nested virtualization state from the kernel to -userspace. - -The maximum size of the state can be retrieved by passing KVM_CAP_NESTED_STATE -to the KVM_CHECK_EXTENSION ioctl(). - -4.115 KVM_SET_NESTED_STATE - -Capability: KVM_CAP_NESTED_STATE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_nested_state (in) -Returns: 0 on success, -1 on error - -This copies the vcpu's kvm_nested_state struct from userspace to the kernel. -For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. - -4.116 KVM_(UN)REGISTER_COALESCED_MMIO - -Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio) - KVM_CAP_COALESCED_PIO (for coalesced pio) -Architectures: all -Type: vm ioctl -Parameters: struct kvm_coalesced_mmio_zone -Returns: 0 on success, < 0 on error - -Coalesced I/O is a performance optimization that defers hardware -register write emulation so that userspace exits are avoided. It is -typically used to reduce the overhead of emulating frequently accessed -hardware registers. - -When a hardware register is configured for coalesced I/O, write accesses -do not exit to userspace and their value is recorded in a ring buffer -that is shared between kernel and userspace. - -Coalesced I/O is used if one or more write accesses to a hardware -register can be deferred until a read or a write to another hardware -register on the same device. This last access will cause a vmexit and -userspace will process accesses from the ring buffer before emulating -it. That will avoid exiting to userspace on repeated writes. - -Coalesced pio is based on coalesced mmio. There is little difference -between coalesced mmio and pio except that coalesced pio records accesses -to I/O ports. - -4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) - -Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 -Architectures: x86, arm, arm64, mips -Type: vm ioctl -Parameters: struct kvm_dirty_log (in) -Returns: 0 on success, -1 on error - -/* for KVM_CLEAR_DIRTY_LOG */ -struct kvm_clear_dirty_log { - __u32 slot; - __u32 num_pages; - __u64 first_page; - union { - void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; - }; -}; - -The ioctl clears the dirty status of pages in a memory slot, according to -the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap -field. Bit 0 of the bitmap corresponds to page "first_page" in the -memory slot, and num_pages is the size in bits of the input bitmap. -first_page must be a multiple of 64; num_pages must also be a multiple of -64 unless first_page + num_pages is the size of the memory slot. For each -bit that is set in the input bitmap, the corresponding page is marked "clean" -in KVM's dirty bitmap, and dirty tracking is re-enabled for that page -(for example via write-protection, or by clearing the dirty bit in -a page table entry). - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies -the address space for which you want to return the dirty bitmap. -They must be less than the value that KVM_CHECK_EXTENSION returns for -the KVM_CAP_MULTI_ADDRESS_SPACE capability. - -This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 -is enabled; for more information, see the description of the capability. -However, it can always be used as long as KVM_CHECK_EXTENSION confirms -that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is present. - -4.118 KVM_GET_SUPPORTED_HV_CPUID - -Capability: KVM_CAP_HYPERV_CPUID -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features leaves related to Hyper-V emulation in -KVM. Userspace can use the information returned by this ioctl to construct -cpuid information presented to guests consuming Hyper-V enlightenments (e.g. -Windows or Hyper-V guests). - -CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level -Functional Specification (TLFS). These leaves can't be obtained with -KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature -leaves (0x40000000, 0x40000001). - -Currently, the following list of CPUID leaves are returned: - HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS - HYPERV_CPUID_INTERFACE - HYPERV_CPUID_VERSION - HYPERV_CPUID_FEATURES - HYPERV_CPUID_ENLIGHTMENT_INFO - HYPERV_CPUID_IMPLEMENT_LIMITS - HYPERV_CPUID_NESTED_FEATURES - -HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was -enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). - -Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure -with the 'nent' field indicating the number of entries in the variable-size -array 'entries'. If the number of entries is too low to describe all Hyper-V -feature leaves, an error (E2BIG) is returned. If the number is more or equal -to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the -number of valid entries in the 'entries' array, which is then filled. - -'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved, -userspace should not expect to get any particular value there. - -4.119 KVM_ARM_VCPU_FINALIZE - -Architectures: arm, arm64 -Type: vcpu ioctl -Parameters: int feature (in) -Returns: 0 on success, -1 on error -Errors: - EPERM: feature not enabled, needs configuration, or already finalized - EINVAL: feature unknown or not present - -Recognised values for feature: - arm64 KVM_ARM_VCPU_SVE (requires KVM_CAP_ARM_SVE) - -Finalizes the configuration of the specified vcpu feature. - -The vcpu must already have been initialised, enabling the affected feature, by -means of a successful KVM_ARM_VCPU_INIT call with the appropriate flag set in -features[]. - -For affected vcpu features, this is a mandatory step that must be performed -before the vcpu is fully usable. - -Between KVM_ARM_VCPU_INIT and KVM_ARM_VCPU_FINALIZE, the feature may be -configured by use of ioctls such as KVM_SET_ONE_REG. The exact configuration -that should be performaned and how to do it are feature-dependent. - -Other calls that depend on a particular feature being finalized, such as -KVM_RUN, KVM_GET_REG_LIST, KVM_GET_ONE_REG and KVM_SET_ONE_REG, will fail with --EPERM unless the feature has already been finalized by means of a -KVM_ARM_VCPU_FINALIZE call. - -See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization -using this ioctl. - -4.120 KVM_SET_PMU_EVENT_FILTER - -Capability: KVM_CAP_PMU_EVENT_FILTER -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pmu_event_filter (in) -Returns: 0 on success, -1 on error - -struct kvm_pmu_event_filter { - __u32 action; - __u32 nevents; - __u32 fixed_counter_bitmap; - __u32 flags; - __u32 pad[4]; - __u64 events[0]; -}; - -This ioctl restricts the set of PMU events that the guest can program. -The argument holds a list of events which will be allowed or denied. -The eventsel+umask of each event the guest attempts to program is compared -against the events field to determine whether the guest should have access. -The events field only controls general purpose counters; fixed purpose -counters are controlled by the fixed_counter_bitmap. - -No flags are defined yet, the field must be zero. - -Valid values for 'action': -#define KVM_PMU_EVENT_ALLOW 0 -#define KVM_PMU_EVENT_DENY 1 - - -5. The kvm_run structure ------------------------- - -Application code obtains a pointer to the kvm_run structure by -mmap()ing a vcpu fd. From that point, application code can control -execution by changing fields in kvm_run prior to calling the KVM_RUN -ioctl, and obtain information about the reason KVM_RUN returned by -looking up structure members. - -struct kvm_run { - /* in */ - __u8 request_interrupt_window; - -Request that KVM_RUN return when it becomes possible to inject external -interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - - __u8 immediate_exit; - -This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN -exits immediately, returning -EINTR. In the common scenario where a -signal is used to "kick" a VCPU out of KVM_RUN, this field can be used -to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. -Rather than blocking the signal outside KVM_RUN, userspace can set up -a signal handler that sets run->immediate_exit to a non-zero value. - -This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. - - __u8 padding1[6]; - - /* out */ - __u32 exit_reason; - -When KVM_RUN has returned successfully (return value 0), this informs -application code why KVM_RUN has returned. Allowable values for this -field are detailed below. - - __u8 ready_for_interrupt_injection; - -If request_interrupt_window has been specified, this field indicates -an interrupt can be injected now with KVM_INTERRUPT. - - __u8 if_flag; - -The value of the current interrupt flag. Only valid if in-kernel -local APIC is not used. - - __u16 flags; - -More architecture-specific flags detailing state of the VCPU that may -affect the device's behavior. The only currently defined flag is -KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the -VCPU is in system management mode. - - /* in (pre_kvm_run), out (post_kvm_run) */ - __u64 cr8; - -The value of the cr8 register. Only valid if in-kernel local APIC is -not used. Both input and output. - - __u64 apic_base; - -The value of the APIC BASE msr. Only valid if in-kernel local -APIC is not used. Both input and output. - - union { - /* KVM_EXIT_UNKNOWN */ - struct { - __u64 hardware_exit_reason; - } hw; - -If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown -reasons. Further architecture-specific information is available in -hardware_exit_reason. - - /* KVM_EXIT_FAIL_ENTRY */ - struct { - __u64 hardware_entry_failure_reason; - } fail_entry; - -If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due -to unknown reasons. Further architecture-specific information is -available in hardware_entry_failure_reason. - - /* KVM_EXIT_EXCEPTION */ - struct { - __u32 exception; - __u32 error_code; - } ex; - -Unused. - - /* KVM_EXIT_IO */ - struct { -#define KVM_EXIT_IO_IN 0 -#define KVM_EXIT_IO_OUT 1 - __u8 direction; - __u8 size; /* bytes */ - __u16 port; - __u32 count; - __u64 data_offset; /* relative to kvm_run start */ - } io; - -If exit_reason is KVM_EXIT_IO, then the vcpu has -executed a port I/O instruction which could not be satisfied by kvm. -data_offset describes where the data is located (KVM_EXIT_IO_OUT) or -where kvm expects application code to place the data for the next -KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. - - /* KVM_EXIT_DEBUG */ - struct { - struct kvm_debug_exit_arch arch; - } debug; - -If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event -for which architecture specific information is returned. - - /* KVM_EXIT_MMIO */ - struct { - __u64 phys_addr; - __u8 data[8]; - __u32 len; - __u8 is_write; - } mmio; - -If exit_reason is KVM_EXIT_MMIO, then the vcpu has -executed a memory-mapped I/O instruction which could not be satisfied -by kvm. The 'data' member contains the written data if 'is_write' is -true, and should be filled by application code otherwise. - -The 'data' member contains, in its first 'len' bytes, the value as it would -appear if the VCPU performed a load or store of the appropriate width directly -to the byte array. - -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and - KVM_EXIT_EPR the corresponding -operations are complete (and guest state is consistent) only after userspace -has re-entered the kernel with KVM_RUN. The kernel side will first finish -incomplete operations and then check for pending signals. Userspace -can re-enter the guest with an unmasked signal pending to complete -pending operations. - - /* KVM_EXIT_HYPERCALL */ - struct { - __u64 nr; - __u64 args[6]; - __u64 ret; - __u32 longmode; - __u32 pad; - } hypercall; - -Unused. This was once used for 'hypercall to userspace'. To implement -such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). -Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. - - /* KVM_EXIT_TPR_ACCESS */ - struct { - __u64 rip; - __u32 is_write; - __u32 pad; - } tpr_access; - -To be documented (KVM_TPR_ACCESS_REPORTING). - - /* KVM_EXIT_S390_SIEIC */ - struct { - __u8 icptcode; - __u64 mask; /* psw upper half */ - __u64 addr; /* psw lower half */ - __u16 ipa; - __u32 ipb; - } s390_sieic; - -s390 specific. - - /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 - __u64 s390_reset_flags; - -s390 specific. - - /* KVM_EXIT_S390_UCONTROL */ - struct { - __u64 trans_exc_code; - __u32 pgm_code; - } s390_ucontrol; - -s390 specific. A page fault has occurred for a user controlled virtual -machine (KVM_VM_S390_UNCONTROL) on it's host page table that cannot be -resolved by the kernel. -The program code and the translation exception code that were placed -in the cpu's lowcore are presented here as defined by the z Architecture -Principles of Operation Book in the Chapter for Dynamic Address Translation -(DAT) - - /* KVM_EXIT_DCR */ - struct { - __u32 dcrn; - __u32 data; - __u8 is_write; - } dcr; - -Deprecated - was used for 440 KVM. - - /* KVM_EXIT_OSI */ - struct { - __u64 gprs[32]; - } osi; - -MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch -hypercalls and exit with this exit struct that contains all the guest gprs. - -If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. -Userspace can now handle the hypercall and when it's done modify the gprs as -necessary. Upon guest entry all guest GPRs will then be replaced by the values -in this struct. - - /* KVM_EXIT_PAPR_HCALL */ - struct { - __u64 nr; - __u64 ret; - __u64 args[9]; - } papr_hcall; - -This is used on 64-bit PowerPC when emulating a pSeries partition, -e.g. with the 'pseries' machine type in qemu. It occurs when the -guest does a hypercall using the 'sc 1' instruction. The 'nr' field -contains the hypercall number (from the guest R3), and 'args' contains -the arguments (from the guest R4 - R12). Userspace should put the -return code in 'ret' and any extra returned values in args[]. -The possible hypercalls are defined in the Power Architecture Platform -Requirements (PAPR) document available from www.power.org (free -developer registration required to access it). - - /* KVM_EXIT_S390_TSCH */ - struct { - __u16 subchannel_id; - __u16 subchannel_nr; - __u32 io_int_parm; - __u32 io_int_word; - __u32 ipb; - __u8 dequeued; - } s390_tsch; - -s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled -and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O -interrupt for the target subchannel has been dequeued and subchannel_id, -subchannel_nr, io_int_parm and io_int_word contain the parameters for that -interrupt. ipb is needed for instruction parameter decoding. - - /* KVM_EXIT_EPR */ - struct { - __u32 epr; - } epr; - -On FSL BookE PowerPC chips, the interrupt controller has a fast patch -interrupt acknowledge path to the core. When the core successfully -delivers an interrupt, it automatically populates the EPR register with -the interrupt vector number and acknowledges the interrupt inside -the interrupt controller. - -In case the interrupt controller lives in user space, we need to do -the interrupt acknowledge cycle through it to fetch the next to be -delivered interrupt vector using this exit. - -It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an -external interrupt has just been delivered into the guest. User space -should put the acknowledged interrupt vector into the 'epr' field. - - /* KVM_EXIT_SYSTEM_EVENT */ - struct { -#define KVM_SYSTEM_EVENT_SHUTDOWN 1 -#define KVM_SYSTEM_EVENT_RESET 2 -#define KVM_SYSTEM_EVENT_CRASH 3 - __u32 type; - __u64 flags; - } system_event; - -If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered -a system-level event using some architecture specific mechanism (hypercall -or some special instruction). In case of ARM/ARM64, this is triggered using -HVC instruction based PSCI call from the vcpu. The 'type' field describes -the system-level event type. The 'flags' field describes architecture -specific flags for the system-level event. - -Valid values for 'type' are: - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the - VM. Userspace is not obliged to honour this, and if it does honour - this does not need to destroy the VM synchronously (ie it may call - KVM_RUN again before shutdown finally occurs). - KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM. - As with SHUTDOWN, userspace can choose to ignore the request, or - to schedule the reset to occur in the future and may call KVM_RUN again. - KVM_SYSTEM_EVENT_CRASH -- the guest crash occurred and the guest - has requested a crash condition maintenance. Userspace can choose - to ignore the request, or to gather VM memory core dump and/or - reset/shutdown of the VM. - - /* KVM_EXIT_IOAPIC_EOI */ - struct { - __u8 vector; - } eoi; - -Indicates that the VCPU's in-kernel local APIC received an EOI for a -level-triggered IOAPIC interrupt. This exit only triggers when the -IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); -the userspace IOAPIC should process the EOI and retrigger the interrupt if -it is still asserted. Vector is the LAPIC interrupt vector for which the -EOI was received. - - struct kvm_hyperv_exit { -#define KVM_EXIT_HYPERV_SYNIC 1 -#define KVM_EXIT_HYPERV_HCALL 2 - __u32 type; - union { - struct { - __u32 msr; - __u64 control; - __u64 evt_page; - __u64 msg_page; - } synic; - struct { - __u64 input; - __u64 result; - __u64 params[2]; - } hcall; - } u; - }; - /* KVM_EXIT_HYPERV */ - struct kvm_hyperv_exit hyperv; -Indicates that the VCPU exits into userspace to process some tasks -related to Hyper-V emulation. -Valid values for 'type' are: - KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about -Hyper-V SynIC state change. Notification is used to remap SynIC -event/message pages and to enable/disable SynIC messages/events processing -in userspace. - - /* Fix the size of the union. */ - char padding[256]; - }; - - /* - * shared registers between kvm and userspace. - * kvm_valid_regs specifies the register classes set by the host - * kvm_dirty_regs specified the register classes dirtied by userspace - * struct kvm_sync_regs is architecture specific, as well as the - * bits for kvm_valid_regs and kvm_dirty_regs - */ - __u64 kvm_valid_regs; - __u64 kvm_dirty_regs; - union { - struct kvm_sync_regs regs; - char padding[SYNC_REGS_SIZE_BYTES]; - } s; - -If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access -certain guest registers without having to call SET/GET_*REGS. Thus we can -avoid some system call overhead if userspace has to handle the exit. -Userspace can query the validity of the structure by checking -kvm_valid_regs for specific bits. These bits are architecture specific -and usually define the validity of a groups of registers. (e.g. one bit - for general purpose registers) - -Please note that the kernel is allowed to use the kvm_run structure as the -primary storage for certain register types. Therefore, the kernel may use the -values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. - -}; - - - -6. Capabilities that can be enabled on vCPUs --------------------------------------------- - -There are certain capabilities that change the behavior of the virtual CPU or -the virtual machine when enabled. To enable them, please see section 4.37. -Below you can find a list of capabilities and what their effect on the vCPU or -the virtual machine is when enabling them. - -The following information is provided along with the description: - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Target: whether this is a per-vcpu or per-vm capability. - - Parameters: what parameters are accepted by the capability. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -6.1 KVM_CAP_PPC_OSI - -Architectures: ppc -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables interception of OSI hypercalls that otherwise would -be treated as normal system calls to be injected into the guest. OSI hypercalls -were invented by Mac-on-Linux to have a standardized communication mechanism -between the guest and the host. - -When this capability is enabled, KVM_EXIT_OSI can occur. - - -6.2 KVM_CAP_PPC_PAPR - -Architectures: ppc -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables interception of PAPR hypercalls. PAPR hypercalls are -done using the hypercall instruction "sc 1". - -It also sets the guest privilege level to "supervisor" mode. Usually the guest -runs in "hypervisor" privilege mode with a few missing features. - -In addition to the above, it changes the semantics of SDR1. In this mode, the -HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the -HTAB invisible to the guest. - -When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. - - -6.3 KVM_CAP_SW_TLB - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the address of a struct kvm_config_tlb -Returns: 0 on success; -1 on error - -struct kvm_config_tlb { - __u64 params; - __u64 array; - __u32 mmu_type; - __u32 array_len; -}; - -Configures the virtual CPU's TLB array, establishing a shared memory area -between userspace and KVM. The "params" and "array" fields are userspace -addresses of mmu-type-specific data structures. The "array_len" field is an -safety mechanism, and should be set to the size in bytes of the memory that -userspace has reserved for the array. It must be at least the size dictated -by "mmu_type" and "params". - -While KVM_RUN is active, the shared region is under control of KVM. Its -contents are undefined, and any modification by userspace results in -boundedly undefined behavior. - -On return from KVM_RUN, the shared region will reflect the current state of -the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB -to tell KVM which entries have been changed, prior to calling KVM_RUN again -on this vcpu. - -For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: - - The "params" field is of type "struct kvm_book3e_206_tlb_params". - - The "array" field points to an array of type "struct - kvm_book3e_206_tlb_entry". - - The array consists of all entries in the first TLB, followed by all - entries in the second TLB. - - Within a TLB, entries are ordered first by increasing set number. Within a - set, entries are ordered by way (increasing ESEL). - - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1) - where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. - - The tsize field of mas1 shall be set to 4K on TLB0, even though the - hardware ignores this value for TLB0. - -6.4 KVM_CAP_S390_CSS_SUPPORT - -Architectures: s390 -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables support for handling of channel I/O instructions. - -TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are -handled in-kernel, while the other I/O instructions are passed to userspace. - -When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST -SUBCHANNEL intercepts. - -Note that even though this capability is enabled per-vcpu, the complete -virtual machine is affected. - -6.5 KVM_CAP_PPC_EPR - -Architectures: ppc -Target: vcpu -Parameters: args[0] defines whether the proxy facility is active -Returns: 0 on success; -1 on error - -This capability enables or disables the delivery of interrupts through the -external proxy facility. - -When enabled (args[0] != 0), every time the guest gets an external interrupt -delivered, it automatically exits into user space with a KVM_EXIT_EPR exit -to receive the topmost interrupt vector. - -When disabled (args[0] == 0), behavior is as if this facility is unsupported. - -When this capability is enabled, KVM_EXIT_EPR can occur. - -6.6 KVM_CAP_IRQ_MPIC - -Architectures: ppc -Parameters: args[0] is the MPIC device fd - args[1] is the MPIC CPU number for this vcpu - -This capability connects the vcpu to an in-kernel MPIC device. - -6.7 KVM_CAP_IRQ_XICS - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the XICS device fd - args[1] is the XICS CPU number (server ID) for this vcpu - -This capability connects the vcpu to an in-kernel XICS device. - -6.8 KVM_CAP_S390_IRQCHIP - -Architectures: s390 -Target: vm -Parameters: none - -This capability enables the in-kernel irqchip for s390. Please refer to -"4.24 KVM_CREATE_IRQCHIP" for details. - -6.9 KVM_CAP_MIPS_FPU - -Architectures: mips -Target: vcpu -Parameters: args[0] is reserved for future use (should be 0). - -This capability allows the use of the host Floating Point Unit by the guest. It -allows the Config1.FP bit to be set to enable the FPU in the guest. Once this is -done the KVM_REG_MIPS_FPR_* and KVM_REG_MIPS_FCR_* registers can be accessed -(depending on the current guest FPU register mode), and the Status.FR, -Config5.FRE bits are accessible via the KVM API and also from the guest, -depending on them being supported by the FPU. - -6.10 KVM_CAP_MIPS_MSA - -Architectures: mips -Target: vcpu -Parameters: args[0] is reserved for future use (should be 0). - -This capability allows the use of the MIPS SIMD Architecture (MSA) by the guest. -It allows the Config3.MSAP bit to be set to enable the use of MSA by the guest. -Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be -accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from -the guest. - -6.74 KVM_CAP_SYNC_REGS -Architectures: s390, x86 -Target: s390: always enabled, x86: vcpu -Parameters: none -Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register -sets are supported (bitfields defined in arch/x86/include/uapi/asm/kvm.h). - -As described above in the kvm_sync_regs struct info in section 5 (kvm_run): -KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers -without having to call SET/GET_*REGS". This reduces overhead by eliminating -repeated ioctl calls for setting and/or getting register values. This is -particularly important when userspace is making synchronous guest state -modifications, e.g. when emulating and/or intercepting instructions in -userspace. - -For s390 specifics, please refer to the source code. - -For x86: -- the register sets to be copied out to kvm_run are selectable - by userspace (rather that all sets being copied out for every exit). -- vcpu_events are available in addition to regs and sregs. - -For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to -function as an input bit-array field set by userspace to indicate the -specific register sets to be copied out on the next exit. - -To indicate when userspace has modified values that should be copied into -the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set. -This is done using the same bitflags as for the 'kvm_valid_regs' field. -If the dirty bit is not set, then the register set values will not be copied -into the vCPU even if they've been modified. - -Unused bitfields in the bitarrays must be set to zero. - -struct kvm_sync_regs { - struct kvm_regs regs; - struct kvm_sregs sregs; - struct kvm_vcpu_events events; -}; - -6.75 KVM_CAP_PPC_IRQ_XIVE - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the XIVE device fd - args[1] is the XIVE CPU number (server ID) for this vcpu - -This capability connects the vcpu to an in-kernel XIVE device. - -7. Capabilities that can be enabled on VMs ------------------------------------------- - -There are certain capabilities that change the behavior of the virtual -machine when enabled. To enable them, please see section 4.37. Below -you can find a list of capabilities and what their effect on the VM -is when enabling them. - -The following information is provided along with the description: - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Parameters: what parameters are accepted by the capability. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -7.1 KVM_CAP_PPC_ENABLE_HCALL - -Architectures: ppc -Parameters: args[0] is the sPAPR hcall number - args[1] is 0 to disable, 1 to enable in-kernel handling - -This capability controls whether individual sPAPR hypercalls (hcalls) -get handled by the kernel or not. Enabling or disabling in-kernel -handling of an hcall is effective across the VM. On creation, an -initial set of hcalls are enabled for in-kernel handling, which -consists of those hcalls for which in-kernel handlers were implemented -before this capability was implemented. If disabled, the kernel will -not to attempt to handle the hcall, but will always exit to userspace -to handle it. Note that it may not make sense to enable some and -disable others of a group of related hcalls, but KVM does not prevent -userspace from doing that. - -If the hcall number specified is not one that has an in-kernel -implementation, the KVM_ENABLE_CAP ioctl will fail with an EINVAL -error. - -7.2 KVM_CAP_S390_USER_SIGP - -Architectures: s390 -Parameters: none - -This capability controls which SIGP orders will be handled completely in user -space. With this capability enabled, all fast orders will be handled completely -in the kernel: -- SENSE -- SENSE RUNNING -- EXTERNAL CALL -- EMERGENCY SIGNAL -- CONDITIONAL EMERGENCY SIGNAL - -All other orders will be handled completely in user space. - -Only privileged operation exceptions will be checked for in the kernel (or even -in the hardware prior to interception). If this capability is not enabled, the -old way of handling SIGP orders is used (partially in kernel and user space). - -7.3 KVM_CAP_S390_VECTOR_REGISTERS - -Architectures: s390 -Parameters: none -Returns: 0 on success, negative value on error - -Allows use of the vector registers introduced with z13 processor, and -provides for the synchronization between host and user space. Will -return -EINVAL if the machine does not support vectors. - -7.4 KVM_CAP_S390_USER_STSI - -Architectures: s390 -Parameters: none - -This capability allows post-handlers for the STSI instruction. After -initial handling in the kernel, KVM exits to user space with -KVM_EXIT_S390_STSI to allow user space to insert further data. - -Before exiting to userspace, kvm handlers should fill in s390_stsi field of -vcpu->run: -struct { - __u64 addr; - __u8 ar; - __u8 reserved; - __u8 fc; - __u8 sel1; - __u16 sel2; -} s390_stsi; - -@addr - guest address of STSI SYSIB -@fc - function code -@sel1 - selector 1 -@sel2 - selector 2 -@ar - access register number - -KVM handlers should exit to userspace with rc = -EREMOTE. - -7.5 KVM_CAP_SPLIT_IRQCHIP - -Architectures: x86 -Parameters: args[0] - number of routes reserved for userspace IOAPICs -Returns: 0 on success, -1 on error - -Create a local apic for each processor in the kernel. This can be used -instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the -IOAPIC and PIC (and also the PIT, even though this has to be enabled -separately). - -This capability also enables in kernel routing of interrupt requests; -when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are -used in the IRQ routing table. The first args[0] MSI routes are reserved -for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, -a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. - -Fails if VCPU has already been created, or if the irqchip is already in the -kernel (i.e. KVM_CREATE_IRQCHIP has already been called). - -7.6 KVM_CAP_S390_RI - -Architectures: s390 -Parameters: none - -Allows use of runtime-instrumentation introduced with zEC12 processor. -Will return -EINVAL if the machine does not support runtime-instrumentation. -Will return -EBUSY if a VCPU has already been created. - -7.7 KVM_CAP_X2APIC_API - -Architectures: x86 -Parameters: args[0] - features that should be enabled -Returns: 0 on success, -EINVAL when args[0] contains invalid features - -Valid feature flags in args[0] are - -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) - -Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of -KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, -allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their -respective sections. - -KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work -in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff -as a broadcast even in x2APIC mode in order to support physical x2APIC -without interrupt remapping. This is undesirable in logical mode, -where 0xff represents CPUs 0-7 in cluster 0. - -7.8 KVM_CAP_S390_USER_INSTR0 - -Architectures: s390 -Parameters: none - -With this capability enabled, all illegal instructions 0x0000 (2 bytes) will -be intercepted and forwarded to user space. User space can use this -mechanism e.g. to realize 2-byte software breakpoints. The kernel will -not inject an operating exception for these instructions, user space has -to take care of that. - -This capability can be enabled dynamically even if VCPUs were already -created and are running. - -7.9 KVM_CAP_S390_GS - -Architectures: s390 -Parameters: none -Returns: 0 on success; -EINVAL if the machine does not support - guarded storage; -EBUSY if a VCPU has already been created. - -Allows use of guarded storage for the KVM guest. - -7.10 KVM_CAP_S390_AIS - -Architectures: s390 -Parameters: none - -Allow use of adapter-interruption suppression. -Returns: 0 on success; -EBUSY if a VCPU has already been created. - -7.11 KVM_CAP_PPC_SMT - -Architectures: ppc -Parameters: vsmt_mode, flags - -Enabling this capability on a VM provides userspace with a way to set -the desired virtual SMT mode (i.e. the number of virtual CPUs per -virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2 -between 1 and 8. On POWER8, vsmt_mode must also be no greater than -the number of threads per subcore for the host. Currently flags must -be 0. A successful call to enable this capability will result in -vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is -subsequently queried for the VM. This capability is only supported by -HV KVM, and can only be set before any VCPUs have been created. -The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT -modes are available. - -7.12 KVM_CAP_PPC_FWNMI - -Architectures: ppc -Parameters: none - -With this capability a machine check exception in the guest address -space will cause KVM to exit the guest with NMI exit reason. This -enables QEMU to build error log and branch to guest kernel registered -machine check handling routine. Without this capability KVM will -branch to guests' 0x200 interrupt vector. - -7.13 KVM_CAP_X86_DISABLE_EXITS - -Architectures: x86 -Parameters: args[0] defines which exits are disabled -Returns: 0 on success, -EINVAL when args[0] contains invalid exits - -Valid bits in args[0] are - -#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) -#define KVM_X86_DISABLE_EXITS_HLT (1 << 1) -#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) -#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) - -Enabling this capability on a VM provides userspace with a way to no -longer intercept some instructions for improved latency in some -workloads, and is suggested when vCPUs are associated to dedicated -physical CPUs. More bits can be added in the future; userspace can -just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable -all such vmexits. - -Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. - -7.14 KVM_CAP_S390_HPAGE_1M - -Architectures: s390 -Parameters: none -Returns: 0 on success, -EINVAL if hpage module parameter was not set - or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL - flag set - -With this capability the KVM support for memory backing with 1m pages -through hugetlbfs can be enabled for a VM. After the capability is -enabled, cmma can't be enabled anymore and pfmfi and the storage key -interpretation are disabled. If cmma has already been enabled or the -hpage module parameter is not set to 1, -EINVAL is returned. - -While it is generally possible to create a huge page backed VM without -this capability, the VM will not be able to run. - -7.15 KVM_CAP_MSR_PLATFORM_INFO - -Architectures: x86 -Parameters: args[0] whether feature should be enabled or not - -With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, -a #GP would be raised when the guest tries to access. Currently, this -capability does not enable write permissions of this MSR for the guest. - -7.16 KVM_CAP_PPC_NESTED_HV - -Architectures: ppc -Parameters: none -Returns: 0 on success, -EINVAL when the implementation doesn't support - nested-HV virtualization. - -HV-KVM on POWER9 and later systems allows for "nested-HV" -virtualization, which provides a way for a guest VM to run guests that -can run using the CPU's supervisor mode (privileged non-hypervisor -state). Enabling this capability on a VM depends on the CPU having -the necessary functionality and on the facility being enabled with a -kvm-hv module parameter. - -7.17 KVM_CAP_EXCEPTION_PAYLOAD - -Architectures: x86 -Parameters: args[0] whether feature should be enabled or not - -With this capability enabled, CR2 will not be modified prior to the -emulated VM-exit when L1 intercepts a #PF exception that occurs in -L2. Similarly, for kvm-intel only, DR6 will not be modified prior to -the emulated VM-exit when L1 intercepts a #DB exception that occurs in -L2. As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or -#DB) exception for L2, exception.has_payload will be set and the -faulting address (or the new DR6 bits*) will be reported in the -exception_payload field. Similarly, when userspace injects a #PF (or -#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set -exception.has_payload and to put the faulting address (or the new DR6 -bits*) in the exception_payload field. - -This capability also enables exception.pending in struct -kvm_vcpu_events, which allows userspace to distinguish between pending -and injected exceptions. - - -* For the new DR6 bits, note that bit 16 is set iff the #DB exception - will clear DR6.RTM. - -7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 - -Architectures: x86, arm, arm64, mips -Parameters: args[0] whether feature should be enabled or not - -With this capability enabled, KVM_GET_DIRTY_LOG will not automatically -clear and write-protect all pages that are returned as dirty. -Rather, userspace will have to do this operation separately using -KVM_CLEAR_DIRTY_LOG. - -At the cost of a slightly more complicated operation, this provides better -scalability and responsiveness for two reasons. First, -KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather -than requiring to sync a full memslot; this ensures that KVM does not -take spinlocks for an extended period of time. Second, in some cases a -large amount of time can pass between a call to KVM_GET_DIRTY_LOG and -userspace actually using the data in the page. Pages can be modified -during this time, which is inefficint for both the guest and userspace: -the guest will incur a higher penalty due to write protection faults, -while userspace can see false reports of dirty pages. Manual reprotection -helps reducing this time, improving guest performance and reducing the -number of dirty log false positives. - -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make -it hard or impossible to use it correctly. The availability of -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. -Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. - -8. Other capabilities. ----------------------- - -This section lists capabilities that give information about other -features of the KVM implementation. - -8.1 KVM_CAP_PPC_HWRNG - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that that the kernel has an implementation of the -H_RANDOM hypercall backed by a hardware random-number generator. -If present, the kernel H_RANDOM handler can be enabled for guest use -with the KVM_CAP_PPC_ENABLE_HCALL capability. - -8.2 KVM_CAP_HYPERV_SYNIC - -Architectures: x86 -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that that the kernel has an implementation of the -Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is -used to support Windows Hyper-V based guest paravirt drivers(VMBus). - -In order to use SynIC, it has to be activated by setting this -capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this -will disable the use of APIC hardware virtualization even if supported -by the CPU, as it's incompatible with SynIC auto-EOI behavior. - -8.3 KVM_CAP_PPC_RADIX_MMU - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that that the kernel can support guests using the -radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 -processor). - -8.4 KVM_CAP_PPC_HASH_MMU_V3 - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that that the kernel can support guests using the -hashed page table MMU defined in Power ISA V3.00 (as implemented in -the POWER9 processor), including in-memory segment tables. - -8.5 KVM_CAP_MIPS_VZ - -Architectures: mips - -This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that -it is available, means that full hardware assisted virtualization capabilities -of the hardware are available for use through KVM. An appropriate -KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which -utilises it. - -If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is -available, it means that the VM is using full hardware assisted virtualization -capabilities of the hardware. This is useful to check after creating a VM with -KVM_VM_MIPS_DEFAULT. - -The value returned by KVM_CHECK_EXTENSION should be compared against known -values (see below). All other values are reserved. This is to allow for the -possibility of other hardware assisted virtualization implementations which -may be incompatible with the MIPS VZ ASE. - - 0: The trap & emulate implementation is in use to run guest code in user - mode. Guest virtual memory segments are rearranged to fit the guest in the - user mode address space. - - 1: The MIPS VZ ASE is in use, providing full hardware assisted - virtualization, including standard guest virtual memory segments. - -8.6 KVM_CAP_MIPS_TE - -Architectures: mips - -This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that -it is available, means that the trap & emulate implementation is available to -run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware -assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed -to KVM_CREATE_VM to create a VM which utilises it. - -If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is -available, it means that the VM is using trap & emulate. - -8.7 KVM_CAP_MIPS_64BIT - -Architectures: mips - -This capability indicates the supported architecture type of the guest, i.e. the -supported register and address width. - -The values returned when this capability is checked by KVM_CHECK_EXTENSION on a -kvm VM handle correspond roughly to the CP0_Config.AT register field, and should -be checked specifically against known values (see below). All other values are -reserved. - - 0: MIPS32 or microMIPS32. - Both registers and addresses are 32-bits wide. - It will only be possible to run 32-bit guest code. - - 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. - Registers are 64-bits wide, but addresses are 32-bits wide. - 64-bit guest code may run but cannot access MIPS64 memory segments. - It will also be possible to run 32-bit guest code. - - 2: MIPS64 or microMIPS64 with access to all address segments. - Both registers and addresses are 64-bits wide. - It will be possible to run 64-bit or 32-bit guest code. - -8.9 KVM_CAP_ARM_USER_IRQ - -Architectures: arm, arm64 -This capability, if KVM_CHECK_EXTENSION indicates that it is available, means -that if userspace creates a VM without an in-kernel interrupt controller, it -will be notified of changes to the output level of in-kernel emulated devices, -which can generate virtual interrupts, presented to the VM. -For such VMs, on every return to userspace, the kernel -updates the vcpu's run->s.regs.device_irq_level field to represent the actual -output level of the device. - -Whenever kvm detects a change in the device output level, kvm guarantees at -least one return to userspace before running the VM. This exit could either -be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. This way, -userspace can always sample the device output level and re-compute the state of -the userspace interrupt controller. Userspace should always check the state -of run->s.regs.device_irq_level on every kvm exit. -The value in run->s.regs.device_irq_level can represent both level and edge -triggered interrupt signals, depending on the device. Edge triggered interrupt -signals will exit to userspace with the bit in run->s.regs.device_irq_level -set exactly once per edge signal. - -The field run->s.regs.device_irq_level is available independent of -run->kvm_valid_regs or run->kvm_dirty_regs bits. - -If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a -number larger than 0 indicating the version of this capability is implemented -and thereby which bits in in run->s.regs.device_irq_level can signal values. - -Currently the following bits are defined for the device_irq_level bitmap: - - KVM_CAP_ARM_USER_IRQ >= 1: - - KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer - KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer - KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal - -Future versions of kvm may implement additional events. These will get -indicated by returning a higher number from KVM_CHECK_EXTENSION and will be -listed above. - -8.10 KVM_CAP_PPC_SMT_POSSIBLE - -Architectures: ppc - -Querying this capability returns a bitmap indicating the possible -virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N -(counting from the right) is set, then a virtual SMT mode of 2^N is -available. - -8.11 KVM_CAP_HYPERV_SYNIC2 - -Architectures: x86 - -This capability enables a newer version of Hyper-V Synthetic interrupt -controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM -doesn't clear SynIC message and event flags pages when they are enabled by -writing to the respective MSRs. - -8.12 KVM_CAP_HYPERV_VP_INDEX - -Architectures: x86 - -This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its -value is used to denote the target vcpu for a SynIC interrupt. For -compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this -capability is absent, userspace can still query this msr's value. - -8.13 KVM_CAP_S390_AIS_MIGRATION - -Architectures: s390 -Parameters: none - -This capability indicates if the flic device will be able to get/set the -AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows -to discover this without having to create a flic device. - -8.14 KVM_CAP_S390_PSW - -Architectures: s390 - -This capability indicates that the PSW is exposed via the kvm_run structure. - -8.15 KVM_CAP_S390_GMAP - -Architectures: s390 - -This capability indicates that the user space memory used as guest mapping can -be anywhere in the user memory address space, as long as the memory slots are -aligned and sized to a segment (1MB) boundary. - -8.16 KVM_CAP_S390_COW - -Architectures: s390 - -This capability indicates that the user space memory used as guest mapping can -use copy-on-write semantics as well as dirty pages tracking via read-only page -tables. - -8.17 KVM_CAP_S390_BPB - -Architectures: s390 - -This capability indicates that kvm will implement the interfaces to handle -reset, migration and nested KVM for branch prediction blocking. The stfle -facility 82 should not be provided to the guest without this capability. - -8.18 KVM_CAP_HYPERV_TLBFLUSH - -Architectures: x86 - -This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush -hypercalls: -HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx, -HvFlushVirtualAddressList, HvFlushVirtualAddressListEx. - -8.19 KVM_CAP_ARM_INJECT_SERROR_ESR - -Architectures: arm, arm64 - -This capability indicates that userspace can specify (via the -KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it -takes a virtual SError interrupt exception. -If KVM advertises this capability, userspace can only specify the ISS field for -the ESR syndrome. Other parts of the ESR, such as the EC are generated by the -CPU when the exception is taken. If this virtual SError is taken to EL1 using -AArch64, this value will be reported in the ISS field of ESR_ELx. - -See KVM_CAP_VCPU_EVENTS for more details. -8.20 KVM_CAP_HYPERV_SEND_IPI - -Architectures: x86 - -This capability indicates that KVM supports paravirtualized Hyper-V IPI send -hypercalls: -HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. diff --git a/Documentation/virtual/kvm/arm/hyp-abi.txt b/Documentation/virtual/kvm/arm/hyp-abi.txt deleted file mode 100644 index a20a0bee268d..000000000000 --- a/Documentation/virtual/kvm/arm/hyp-abi.txt +++ /dev/null @@ -1,53 +0,0 @@ -* Internal ABI between the kernel and HYP - -This file documents the interaction between the Linux kernel and the -hypervisor layer when running Linux as a hypervisor (for example -KVM). It doesn't cover the interaction of the kernel with the -hypervisor when running as a guest (under Xen, KVM or any other -hypervisor), or any hypervisor-specific interaction when the kernel is -used as a host. - -On arm and arm64 (without VHE), the kernel doesn't run in hypervisor -mode, but still needs to interact with it, allowing a built-in -hypervisor to be either installed or torn down. - -In order to achieve this, the kernel must be booted at HYP (arm) or -EL2 (arm64), allowing it to install a set of stubs before dropping to -SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction, -and only act on individual CPUs. - -Unless specified otherwise, any built-in hypervisor must implement -these functions (see arch/arm{,64}/include/asm/virt.h): - -* r0/x0 = HVC_SET_VECTORS - r1/x1 = vectors - - Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors' - must be a physical address, and respect the alignment requirements - of the architecture. Only implemented by the initial stubs, not by - Linux hypervisors. - -* r0/x0 = HVC_RESET_VECTORS - - Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initials - stubs' exception vector value. This effectively disables an existing - hypervisor. - -* r0/x0 = HVC_SOFT_RESTART - r1/x1 = restart address - x2 = x0's value when entering the next payload (arm64) - x3 = x1's value when entering the next payload (arm64) - x4 = x2's value when entering the next payload (arm64) - - Mask all exceptions, disable the MMU, move the arguments into place - (arm64 only), and jump to the restart address while at HYP/EL2. This - hypercall is not expected to return to its caller. - -Any other value of r0/x0 triggers a hypervisor-specific handling, -which is not documented here. - -The return value of a stub hypercall is held by r0/x0, and is 0 on -success, and HVC_STUB_ERR on error. A stub hypercall is allowed to -clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and -ip on arm). It is thus recommended to use a function call to perform -the hypercall. diff --git a/Documentation/virtual/kvm/arm/psci.txt b/Documentation/virtual/kvm/arm/psci.txt deleted file mode 100644 index 559586fc9d37..000000000000 --- a/Documentation/virtual/kvm/arm/psci.txt +++ /dev/null @@ -1,61 +0,0 @@ -KVM implements the PSCI (Power State Coordination Interface) -specification in order to provide services such as CPU on/off, reset -and power-off to the guest. - -The PSCI specification is regularly updated to provide new features, -and KVM implements these updates if they make sense from a virtualization -point of view. - -This means that a guest booted on two different versions of KVM can -observe two different "firmware" revisions. This could cause issues if -a given guest is tied to a particular PSCI revision (unlikely), or if -a migration causes a different PSCI version to be exposed out of the -blue to an unsuspecting guest. - -In order to remedy this situation, KVM exposes a set of "firmware -pseudo-registers" that can be manipulated using the GET/SET_ONE_REG -interface. These registers can be saved/restored by userspace, and set -to a convenient value if required. - -The following register is defined: - -* KVM_REG_ARM_PSCI_VERSION: - - - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set - (and thus has already been initialized) - - Returns the current PSCI version on GET_ONE_REG (defaulting to the - highest PSCI version implemented by KVM and compatible with v0.2) - - Allows any PSCI version implemented by KVM and compatible with - v0.2 to be set with SET_ONE_REG - - Affects the whole VM (even if the register view is per-vcpu) - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - Holds the state of the firmware support to mitigate CVE-2017-5715, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_1 in [1]. - Accepted values are: - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: KVM does not offer - firmware support for the workaround. The mitigation status for the - guest is unknown. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: The workaround HVC call is - available to the guest and required for the mitigation. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: The workaround HVC call - is available to the guest, but it is not needed on this VCPU. - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - Holds the state of the firmware support to mitigate CVE-2018-3639, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_2 in [1]. - Accepted values are: - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: A workaround is not - available. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: The workaround state is - unknown. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: The workaround is available, - and can be disabled by a vCPU. If - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for - this vCPU. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: The workaround is - always active on this vCPU or it is not needed. - -[1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf diff --git a/Documentation/virtual/kvm/cpuid.rst b/Documentation/virtual/kvm/cpuid.rst deleted file mode 100644 index 01b081f6e7ea..000000000000 --- a/Documentation/virtual/kvm/cpuid.rst +++ /dev/null @@ -1,107 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============== -KVM CPUID bits -============== - -:Author: Glauber Costa - -A guest running on a kvm host, can check some of its features using -cpuid. This is not always guaranteed to work, since userspace can -mask-out some, or even all KVM-related cpuid features before launching -a guest. - -KVM cpuid functions are: - -function: KVM_CPUID_SIGNATURE (0x40000000) - -returns:: - - eax = 0x40000001 - ebx = 0x4b4d564b - ecx = 0x564b4d56 - edx = 0x4d - -Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". -The value in eax corresponds to the maximum cpuid function present in this leaf, -and will be updated if more functions are added in the future. -Note also that old hosts set eax value to 0x0. This should -be interpreted as if the value was 0x40000001. -This function queries the presence of KVM cpuid leafs. - -function: define KVM_CPUID_FEATURES (0x40000001) - -returns:: - - ebx, ecx - eax = an OR'ed group of (1 << flag) - -where ``flag`` is defined as below: - -================================= =========== ================================ -flag value meaning -================================= =========== ================================ -KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs - 0x11 and 0x12 - -KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays - on PIO operations - -KVM_FEATURE_MMU_OP 2 deprecated - -KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs - - 0x4b564d00 and 0x4b564d01 -KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by - writing to msr 0x4b564d02 - -KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by - writing to msr 0x4b564d03 - -KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt - handler can be enabled by - writing to msr 0x4b564d04 - -KVM_FEATURE_PV_UNHAULT 7 guest checks this feature bit - before enabling paravirtualized - spinlock support - -KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit - before enabling paravirtualized - tlb flush - -KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT - can be enabled by setting bit 2 - when writing to msr 0x4b564d02 - -KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit - before enabling paravirtualized - sebd IPIs - -KVM_FEATURE_PV_POLL_CONTROL 12 host-side polling on HLT can - be disabled by writing - to msr 0x4b564d05. - -KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit - before using paravirtualized - sched yield. - -KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side - per-cpu warps are expeced in - kvmclock -================================= =========== ================================ - -:: - - edx = an OR'ed group of (1 << flag) - -Where ``flag`` here is defined as below: - -================== ============ ================================= -flag value meaning -================== ============ ================================= -KVM_HINTS_REALTIME 0 guest checks this feature bit to - determine that vCPUs are never - preempted for an unlimited time - allowing optimizations -================== ============ ================================= diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README deleted file mode 100644 index 34a69834124a..000000000000 --- a/Documentation/virtual/kvm/devices/README +++ /dev/null @@ -1 +0,0 @@ -This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL. diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt deleted file mode 100644 index eeaa95b893a8..000000000000 --- a/Documentation/virtual/kvm/devices/arm-vgic-its.txt +++ /dev/null @@ -1,181 +0,0 @@ -ARM Virtual Interrupt Translation Service (ITS) -=============================================== - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller - -The ITS allows MSI(-X) interrupts to be injected into guests. This extension is -optional. Creating a virtual ITS controller also requires a host GICv3 (see -arm-vgic-v3.txt), but does not depend on having physical ITS controllers. - -There can be multiple ITS controllers per guest, each of them has to have -a separate, non-overlapping MMIO region. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit) - Base address in the guest physical address space of the GICv3 ITS - control register frame. - This address needs to be 64K aligned and the region covers 128K. - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address - -EEXIST: Address already configured - -EFAULT: Invalid user pointer for attr->addr. - -ENODEV: Incorrect attribute or the ITS is not supported. - - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the ITS, no additional parameter in - kvm_device_attr.addr. - - KVM_DEV_ARM_ITS_CTRL_RESET - reset the ITS, no additional parameter in kvm_device_attr.addr. - See "ITS Reset State" section. - - KVM_DEV_ARM_ITS_SAVE_TABLES - save the ITS table data into guest RAM, at the location provisioned - by the guest in corresponding registers/table entries. - - The layout of the tables in guest memory defines an ABI. The entries - are laid out in little endian format as described in the last paragraph. - - KVM_DEV_ARM_ITS_RESTORE_TABLES - restore the ITS tables from guest RAM to ITS internal structures. - - The GICV3 must be restored before the ITS and all ITS registers but - the GITS_CTLR must be restored before restoring the ITS tables. - - The GITS_IIDR read-only register must also be restored before - calling KVM_DEV_ARM_ITS_RESTORE_TABLES as the IIDR revision field - encodes the ABI revision. - - The expected ordering when restoring the GICv3/ITS is described in section - "ITS Restore Sequence". - - Errors: - -ENXIO: ITS not properly configured as required prior to setting - this attribute - -ENOMEM: Memory shortage when allocating ITS internal data - -EINVAL: Inconsistent restored data - -EFAULT: Invalid guest ram access - -EBUSY: One or more VCPUS are running - -EACCES: The virtual ITS is backed by a physical GICv4 ITS, and the - state is not available - - KVM_DEV_ARM_VGIC_GRP_ITS_REGS - Attributes: - The attr field of kvm_device_attr encodes the offset of the - ITS register, relative to the ITS control frame base address - (ITS_base). - - kvm_device_attr.addr points to a __u64 value whatever the width - of the addressed register (32/64 bits). 64 bit registers can only - be accessed with full length. - - Writes to read-only registers are ignored by the kernel except for: - - GITS_CREADR. It must be restored otherwise commands in the queue - will be re-executed after restoring CWRITER. GITS_CREADR must be - restored before restoring the GITS_CTLR which is likely to enable the - ITS. Also it must be restored after GITS_CBASER since a write to - GITS_CBASER resets GITS_CREADR. - - GITS_IIDR. The Revision field encodes the table layout ABI revision. - In the future we might implement direct injection of virtual LPIs. - This will require an upgrade of the table layout and an evolution of - the ABI. GITS_IIDR must be restored before calling - KVM_DEV_ARM_ITS_RESTORE_TABLES. - - For other registers, getting or setting a register has the same - effect as reading/writing the register on real hardware. - Errors: - -ENXIO: Offset does not correspond to any supported register - -EFAULT: Invalid user pointer for attr->addr - -EINVAL: Offset is not 64-bit aligned - -EBUSY: one or more VCPUS are running - - ITS Restore Sequence: - ------------------------- - -The following ordering must be followed when restoring the GIC and the ITS: -a) restore all guest memory and create vcpus -b) restore all redistributors -c) provide the ITS base address - (KVM_DEV_ARM_VGIC_GRP_ADDR) -d) restore the ITS in the following order: - 1. Restore GITS_CBASER - 2. Restore all other GITS_ registers, except GITS_CTLR! - 3. Load the ITS table data (KVM_DEV_ARM_ITS_RESTORE_TABLES) - 4. Restore GITS_CTLR - -Then vcpus can be started. - - ITS Table ABI REV0: - ------------------- - - Revision 0 of the ABI only supports the features of a virtual GICv3, and does - not support a virtual GICv4 with support for direct injection of virtual - interrupts for nested hypervisors. - - The device table and ITT are indexed by the DeviceID and EventID, - respectively. The collection table is not indexed by CollectionID, and the - entries in the collection are listed in no particular order. - All entries are 8 bytes. - - Device Table Entry (DTE): - - bits: | 63| 62 ... 49 | 48 ... 5 | 4 ... 0 | - values: | V | next | ITT_addr | Size | - - where; - - V indicates whether the entry is valid. If not, other fields - are not meaningful. - - next: equals to 0 if this entry is the last one; otherwise it - corresponds to the DeviceID offset to the next DTE, capped by - 2^14 -1. - - ITT_addr matches bits [51:8] of the ITT address (256 Byte aligned). - - Size specifies the supported number of bits for the EventID, - minus one - - Collection Table Entry (CTE): - - bits: | 63| 62 .. 52 | 51 ... 16 | 15 ... 0 | - values: | V | RES0 | RDBase | ICID | - - where: - - V indicates whether the entry is valid. If not, other fields are - not meaningful. - - RES0: reserved field with Should-Be-Zero-or-Preserved behavior. - - RDBase is the PE number (GICR_TYPER.Processor_Number semantic), - - ICID is the collection ID - - Interrupt Translation Entry (ITE): - - bits: | 63 ... 48 | 47 ... 16 | 15 ... 0 | - values: | next | pINTID | ICID | - - where: - - next: equals to 0 if this entry is the last one; otherwise it corresponds - to the EventID offset to the next ITE capped by 2^16 -1. - - pINTID is the physical LPI ID; if zero, it means the entry is not valid - and other fields are not meaningful. - - ICID is the collection ID - - ITS Reset State: - ---------------- - -RESET returns the ITS to the same state that it was when first created and -initialized. When the RESET command returns, the following things are -guaranteed: - -- The ITS is not enabled and quiescent - GITS_CTLR.Enabled = 0 .Quiescent=1 -- There is no internally cached state -- No collection or device table are used - GITS_BASER.Valid = 0 -- GITS_CBASER = 0, GITS_CREADR = 0, GITS_CWRITER = 0 -- The ABI version is unchanged and remains the one set when the ITS - device was first created. diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt deleted file mode 100644 index ff290b43c8e5..000000000000 --- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt +++ /dev/null @@ -1,251 +0,0 @@ -ARM Virtual Generic Interrupt Controller v3 and later (VGICv3) -============================================================== - - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 - -Only one VGIC instance may be instantiated through this API. The created VGIC -will act as the VM interrupt controller, requiring emulated user-space devices -to inject interrupts to the VGIC instead of directly to CPUs. It is not -possible to create both a GICv3 and GICv2 on the same VM. - -Creating a guest GICv3 device requires a host GICv3 as well. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit) - Base address in the guest physical address space of the GICv3 distributor - register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - This address needs to be 64K aligned and the region covers 64 KByte. - - KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit) - Base address in the guest physical address space of the GICv3 - redistributor register mappings. There are two 64K pages for each - VCPU and all of the redistributor pages are contiguous. - Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - This address needs to be 64K aligned. - - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (rw, 64-bit) - The attribute data pointed to by kvm_device_attr.addr is a __u64 value: - bits: | 63 .... 52 | 51 .... 16 | 15 - 12 |11 - 0 - values: | count | base | flags | index - - index encodes the unique redistributor region index - - flags: reserved for future use, currently 0 - - base field encodes bits [51:16] of the guest physical base address - of the first redistributor in the region. - - count encodes the number of redistributors in the region. Must be - greater than 0. - There are two 64K pages for each redistributor in the region and - redistributors are laid out contiguously within the region. Regions - are filled with redistributors in the index order. The sum of all - region count fields must be greater than or equal to the number of - VCPUs. Redistributor regions must be registered in the incremental - index order, starting from index 0. - The characteristics of a specific redistributor region can be read - by presetting the index field in the attr data. - Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - - It is invalid to mix calls with KVM_VGIC_V3_ADDR_TYPE_REDIST and - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attributes. - - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address, bad redistributor region - count/index, mixed redistributor region attribute usage - -EEXIST: Address already configured - -ENOENT: Attempt to read the characteristics of a non existing - redistributor region - -ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -EFAULT: Invalid user pointer for attr->addr. - - - KVM_DEV_ARM_VGIC_GRP_DIST_REGS - KVM_DEV_ARM_VGIC_GRP_REDIST_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 32 | 31 .... 0 | - values: | mpidr | offset | - - All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a - __u32 value. 64-bit registers must be accessed by separately accessing the - lower and higher word. - - Writes to read-only registers are ignored by the kernel. - - KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers. - KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU - specified by the mpidr. - - The offset is relative to the "[Re]Distributor base address" as defined - in the GICv3/4 specs. Getting or setting such a register has the same - effect as reading or writing the register on real hardware, except for the - following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR, - GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0. These registers behave - differently when accessed via this interface compared to their - architecturally defined behavior to allow software a full view of the - VGIC's internal state. - - The mpidr field is used to specify which - redistributor is accessed. The mpidr is ignored for the distributor. - - The mpidr encoding is based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - - Note that distributor fields are not banked, but return the same value - regardless of the mpidr used to access the register. - - GICD_IIDR.Revision is updated when the KVM implementation is changed in a - way directly observable by the guest or userspace. Userspace should read - GICD_IIDR from KVM and write back the read value to confirm its expected - behavior is aligned with the KVM implementation. Userspace should set - GICD_IIDR before setting any other registers to ensure the expected - behavior. - - - The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such - that a write of a clear bit has no effect, whereas a write with a set bit - clears that value. To allow userspace to freely set the values of these two - registers, setting the attributes with the register offsets for these two - registers simply sets the non-reserved bits to the value written. - - - Accesses (reads and writes) to the GICD_ISPENDR register region and - GICR_ISPENDR0 registers get/set the value of the latched pending state for - the interrupts. - - This is identical to the value returned by a guest read from ISPENDR for an - edge triggered interrupt, but may differ for level triggered interrupts. - For edge triggered interrupts, once an interrupt becomes pending (whether - because of an edge detected on the input line or because of a guest write - to ISPENDR) this state is "latched", and only cleared when either the - interrupt is activated or when the guest writes to ICPENDR. A level - triggered interrupt may be pending either because the level input is held - high by a device, or because of a guest write to the ISPENDR register. Only - ISPENDR writes are latched; if the device lowers the line level then the - interrupt is no longer pending unless the guest also wrote to ISPENDR, and - conversely writes to ICPENDR or activations of the interrupt do not clear - the pending status if the line level is still being held high. (These - rules are documented in the GICv3 specification descriptions of the ICPENDR - and ISPENDR registers.) For a level triggered interrupt the value accessed - here is that of the latch which is set by ISPENDR and cleared by ICPENDR or - interrupt activation, whereas the value returned by a guest read from - ISPENDR is the logical OR of the latch value and the input line level. - - Raw access to the latch state is provided to userspace so that it can save - and restore the entire GIC internal state (which is defined by the - combination of the current input line level and the latch state, and cannot - be deduced from purely the line level and the value of the ISPENDR - registers). - - Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have - RAZ/WI semantics, meaning that reads always return 0 and writes are always - ignored. - - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - - - KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | - values: | mpidr | RES | instr | - - The mpidr field encodes the CPU ID based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - - The instr field encodes the system register to access based on the fields - defined in the A64 instruction set encoding for system register access - (RES means the bits are reserved for future use and should be zero): - - | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 | - | Op 0 | Op1 | CRn | CRm | Op2 | - - All system regs accessed through this API are (rw, 64-bit) and - kvm_device_attr.addr points to a __u64 value. - - KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the - CPU specified by the mpidr field. - - CPU interface registers access is not implemented for AArch32 mode. - Error -ENXIO is returned when accessed in AArch32 mode. - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: VCPU is running - -EINVAL: Invalid mpidr or register value supplied - - - KVM_DEV_ARM_VGIC_GRP_NR_IRQS - Attributes: - A value describing the number of interrupts (SGI, PPI and SPI) for - this GIC instance, ranging from 64 to 1024, in increments of 32. - - kvm_device_attr.addr points to a __u32 value. - - Errors: - -EINVAL: Value set is out of the expected range - -EBUSY: Value has already be set. - - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC, no additional parameter in - kvm_device_attr.addr. - KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES - save all LPI pending bits into guest RAM pending tables. - - The first kB of the pending table is not altered by this operation. - Errors: - -ENXIO: VGIC not properly configured as required prior to calling - this attribute - -ENODEV: no online VCPU - -ENOMEM: memory shortage when allocating vgic internal data - -EFAULT: Invalid guest ram access - -EBUSY: One or more VCPUS are running - - - KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO - Attributes: - The attr field of kvm_device_attr encodes the following values: - bits: | 63 .... 32 | 31 .... 10 | 9 .... 0 | - values: | mpidr | info | vINTID | - - The vINTID specifies which set of IRQs is reported on. - - The info field specifies which information userspace wants to get or set - using this interface. Currently we support the following info values: - - VGIC_LEVEL_INFO_LINE_LEVEL: - Get/Set the input level of the IRQ line for a set of 32 contiguously - numbered interrupts. - vINTID must be a multiple of 32. - - kvm_device_attr.addr points to a __u32 value which will contain a - bitmap where a set bit means the interrupt level is asserted. - - Bit[n] indicates the status for interrupt vINTID + n. - - SGIs and any interrupt with a higher ID than the number of interrupts - supported, will be RAZ/WI. LPIs are always edge-triggered and are - therefore not supported by this interface. - - PPIs are reported per VCPU as specified in the mpidr field, and SPIs are - reported with the same value regardless of the mpidr specified. - - The mpidr field encodes the CPU ID based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - Errors: - -EINVAL: vINTID is not multiple of 32 or - info field is not VGIC_LEVEL_INFO_LINE_LEVEL diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt deleted file mode 100644 index 97b6518148f8..000000000000 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ /dev/null @@ -1,127 +0,0 @@ -ARM Virtual Generic Interrupt Controller v2 (VGIC) -================================================== - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 - -Only one VGIC instance may be instantiated through either this API or the -legacy KVM_CREATE_IRQCHIP API. The created VGIC will act as the VM interrupt -controller, requiring emulated user-space devices to inject interrupts to the -VGIC instead of directly to CPUs. - -GICv3 implementations with hardware compatibility support allow creating a -guest GICv2 through this interface. For information on creating a guest GICv3 -device and guest ITS devices, see arm-vgic-v3.txt. It is not possible to -create both a GICv3 and GICv2 device on the same VM. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_V2_ADDR_TYPE_DIST (rw, 64-bit) - Base address in the guest physical address space of the GIC distributor - register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. - This address needs to be 4K aligned and the region covers 4 KByte. - - KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit) - Base address in the guest physical address space of the GIC virtual cpu - interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. - This address needs to be 4K aligned and the region covers 4 KByte. - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address - -EEXIST: Address already configured - -ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -EFAULT: Invalid user pointer for attr->addr. - - KVM_DEV_ARM_VGIC_GRP_DIST_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | vcpu_index | offset | - - All distributor regs are (rw, 32-bit) - - The offset is relative to the "Distributor base address" as defined in the - GICv2 specs. Getting or setting such a register has the same effect as - reading or writing the register on the actual hardware from the cpu whose - index is specified with the vcpu_index field. Note that most distributor - fields are not banked, but return the same value regardless of the - vcpu_index used to access the register. - - GICD_IIDR.Revision is updated when the KVM implementation of an emulated - GICv2 is changed in a way directly observable by the guest or userspace. - Userspace should read GICD_IIDR from KVM and write back the read value to - confirm its expected behavior is aligned with the KVM implementation. - Userspace should set GICD_IIDR before setting any other registers (both - KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_CPU_REGS) to ensure - the expected behavior. Unless GICD_IIDR has been set from userspace, writes - to the interrupt group registers (GICD_IGROUPR) are ignored. - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - -EINVAL: Invalid vcpu_index supplied - - KVM_DEV_ARM_VGIC_GRP_CPU_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | vcpu_index | offset | - - All CPU interface regs are (rw, 32-bit) - - The offset specifies the offset from the "CPU interface base address" as - defined in the GICv2 specs. Getting or setting such a register has the - same effect as reading or writing the register on the actual hardware. - - The Active Priorities Registers APRn are implementation defined, so we set a - fixed format for our implementation that fits with the model of a "GICv2 - implementation without the security extensions" which we present to the - guest. This interface always exposes four register APR[0-3] describing the - maximum possible 128 preemption levels. The semantics of the register - indicate if any interrupts in a given preemption level are in the active - state by setting the corresponding bit. - - Thus, preemption level X has one or more active interrupts if and only if: - - APRn[X mod 32] == 0b1, where n = X / 32 - - Bits for undefined preemption levels are RAZ/WI. - - Note that this differs from a CPU's view of the APRs on hardware in which - a GIC without the security extensions expose group 0 and group 1 active - priorities in separate register groups, whereas we show a combined view - similar to GICv2's GICH_APR. - - For historical reasons and to provide ABI compatibility with userspace we - export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask - field in the lower 5 bits of a word, meaning that userspace must always - use the lower 5 bits to communicate with the KVM device and must shift the - value left by 3 places to obtain the actual priority mask level. - - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - -EINVAL: Invalid vcpu_index supplied - - KVM_DEV_ARM_VGIC_GRP_NR_IRQS - Attributes: - A value describing the number of interrupts (SGI, PPI and SPI) for - this GIC instance, ranging from 64 to 1024, in increments of 32. - - Errors: - -EINVAL: Value set is out of the expected range - -EBUSY: Value has already be set, or GIC has already been initialized - with default values. - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC or ITS, no additional parameter - in kvm_device_attr.addr. - Errors: - -ENXIO: VGIC not properly configured as required prior to calling - this attribute - -ENODEV: no online VCPU - -ENOMEM: memory shortage when allocating vgic internal data diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virtual/kvm/devices/mpic.txt deleted file mode 100644 index 8257397adc3c..000000000000 --- a/Documentation/virtual/kvm/devices/mpic.txt +++ /dev/null @@ -1,53 +0,0 @@ -MPIC interrupt controller -========================= - -Device types supported: - KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0 - KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2 - -Only one MPIC instance, of any type, may be instantiated. The created -MPIC will act as the system interrupt controller, connecting to each -vcpu's interrupt inputs. - -Groups: - KVM_DEV_MPIC_GRP_MISC - Attributes: - KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit) - Base address of the 256 KiB MPIC register space. Must be - naturally aligned. A value of zero disables the mapping. - Reset value is zero. - - KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit) - Access an MPIC register, as if the access were made from the guest. - "attr" is the byte offset into the MPIC register space. Accesses - must be 4-byte aligned. - - MSIs may be signaled by using this attribute group to write - to the relevant MSIIR. - - KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit) - IRQ input line for each standard openpic source. 0 is inactive and 1 - is active, regardless of interrupt sense. - - For edge-triggered interrupts: Writing 1 is considered an activating - edge, and writing 0 is ignored. Reading returns 1 if a previously - signaled edge has not been acknowledged, and 0 otherwise. - - "attr" is the IRQ number. IRQ numbers for standard sources are the - byte offset of the relevant IVPR from EIVPR0, divided by 32. - -IRQ Routing: - - The MPIC emulation supports IRQ routing. Only a single MPIC device can - be instantiated. Once that device has been created, it's available as - irqchip id 0. - - This irqchip 0 has 256 interrupt pins, which expose the interrupts in - the main array of interrupt sources (a.k.a. "SRC" interrupts). - - The numbering is the same as the MPIC device tree binding -- based on - the register offset from the beginning of the sources array, without - regard to any subdivisions in chip documentation such as "internal" - or "external" interrupts. - - Access to non-SRC interrupts is not implemented through IRQ routing mechanisms. diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt deleted file mode 100644 index a4e20a090174..000000000000 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ /dev/null @@ -1,163 +0,0 @@ -FLIC (floating interrupt controller) -==================================== - -FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some -machine check interruptions. All interrupts are stored in a per-vm list of -pending interrupts. FLIC performs operations on this list. - -Only one FLIC instance may be instantiated. - -FLIC provides support to -- add interrupts (KVM_DEV_FLIC_ENQUEUE) -- inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS) -- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) -- purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ) -- enable/disable for the guest transparent async page faults -- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) -- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) -- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) -- get/set all AIS mode states (KVM_DEV_FLIC_AISM_ALL) - -Groups: - KVM_DEV_FLIC_ENQUEUE - Passes a buffer and length into the kernel which are then injected into - the list of pending interrupts. - attr->addr contains the pointer to the buffer and attr->attr contains - the length of the buffer. - The format of the data structure kvm_s390_irq as it is copied from userspace - is defined in usr/include/linux/kvm.h. - - KVM_DEV_FLIC_GET_ALL_IRQS - Copies all floating interrupts into a buffer provided by userspace. - When the buffer is too small it returns -ENOMEM, which is the indication - for userspace to try again with a bigger buffer. - -ENOBUFS is returned when the allocation of a kernelspace buffer has - failed. - -EFAULT is returned when copying data to userspace failed. - All interrupts remain pending, i.e. are not deleted from the list of - currently pending interrupts. - attr->addr contains the userspace address of the buffer into which all - interrupt data will be copied. - attr->attr contains the size of the buffer in bytes. - - KVM_DEV_FLIC_CLEAR_IRQS - Simply deletes all elements from the list of currently pending floating - interrupts. No interrupts are injected into the guest. - - KVM_DEV_FLIC_CLEAR_IO_IRQ - Deletes one (if any) I/O interrupt for a subchannel identified by the - subsystem identification word passed via the buffer specified by - attr->addr (address) and attr->attr (length). - - KVM_DEV_FLIC_APF_ENABLE - Enables async page faults for the guest. So in case of a major page fault - the host is allowed to handle this async and continues the guest. - - KVM_DEV_FLIC_APF_DISABLE_WAIT - Disables async page faults for the guest and waits until already pending - async page faults are done. This is necessary to trigger a completion interrupt - for every init interrupt before migrating the interrupt list. - - KVM_DEV_FLIC_ADAPTER_REGISTER - Register an I/O adapter interrupt source. Takes a kvm_s390_io_adapter - describing the adapter to register: - -struct kvm_s390_io_adapter { - __u32 id; - __u8 isc; - __u8 maskable; - __u8 swap; - __u8 flags; -}; - - id contains the unique id for the adapter, isc the I/O interruption subclass - to use, maskable whether this adapter may be masked (interrupts turned off), - swap whether the indicators need to be byte swapped, and flags contains - further characteristics of the adapter. - Currently defined values for 'flags' are: - - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS - (adapter-interrupt-suppression) facility. This flag only has an effect if - the AIS capability is enabled. - Unknown flag values are ignored. - - - KVM_DEV_FLIC_ADAPTER_MODIFY - Modifies attributes of an existing I/O adapter interrupt source. Takes - a kvm_s390_io_adapter_req specifying the adapter and the operation: - -struct kvm_s390_io_adapter_req { - __u32 id; - __u8 type; - __u8 mask; - __u16 pad0; - __u64 addr; -}; - - id specifies the adapter and type the operation. The supported operations - are: - - KVM_S390_IO_ADAPTER_MASK - mask or unmask the adapter, as specified in mask - - KVM_S390_IO_ADAPTER_MAP - perform a gmap translation for the guest address provided in addr, - pin a userspace page for the translated address and add it to the - list of mappings - Note: A new mapping will be created unconditionally; therefore, - the calling code should avoid making duplicate mappings. - - KVM_S390_IO_ADAPTER_UNMAP - release a userspace page for the translated address specified in addr - from the list of mappings - - KVM_DEV_FLIC_AISM - modify the adapter-interruption-suppression mode for a given isc if the - AIS capability is enabled. Takes a kvm_s390_ais_req describing: - -struct kvm_s390_ais_req { - __u8 isc; - __u16 mode; -}; - - isc contains the target I/O interruption subclass, mode the target - adapter-interruption-suppression mode. The following modes are - currently supported: - - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection - is always allowed; - - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq - injection is only allowed once and the following adapter interrupts - will be suppressed until the mode is set again to ALL-Interruptions - or SINGLE-Interruption mode. - - KVM_DEV_FLIC_AIRQ_INJECT - Inject adapter interrupts on a specified adapter. - attr->attr contains the unique id for the adapter, which allows for - adapter-specific checks and actions. - For adapters subject to AIS, handle the airq injection suppression for - an isc according to the adapter-interruption-suppression mode on condition - that the AIS capability is enabled. - - KVM_DEV_FLIC_AISM_ALL - Gets or sets the adapter-interruption-suppression mode for all ISCs. Takes - a kvm_s390_ais_all describing: - -struct kvm_s390_ais_all { - __u8 simm; /* Single-Interruption-Mode mask */ - __u8 nimm; /* No-Interruption-Mode mask * -}; - - simm contains Single-Interruption-Mode mask for all ISCs, nimm contains - No-Interruption-Mode mask for all ISCs. Each bit in simm and nimm corresponds - to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and - nimm bit presents AIS mode for a ISC. - - KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION. - -Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on -FLIC with an unknown group or attribute gives the error code EINVAL (instead of -ENXIO, as specified in the API documentation). It is not possible to conclude -that a FLIC operation is unavailable based on the error code resulting from a -usage attempt. - -Note: The KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl will return EINVAL in case a zero -schid is specified. diff --git a/Documentation/virtual/kvm/devices/vcpu.txt b/Documentation/virtual/kvm/devices/vcpu.txt deleted file mode 100644 index 2b5dab16c4f2..000000000000 --- a/Documentation/virtual/kvm/devices/vcpu.txt +++ /dev/null @@ -1,62 +0,0 @@ -Generic vcpu interface -==================================== - -The virtual cpu "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, -KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same struct -kvm_device_attr as other devices, but targets VCPU-wide settings and controls. - -The groups and attributes per virtual cpu, if any, are architecture specific. - -1. GROUP: KVM_ARM_VCPU_PMU_V3_CTRL -Architectures: ARM64 - -1.1. ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_IRQ -Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a - pointer to an int -Returns: -EBUSY: The PMU overflow interrupt is already set - -ENXIO: The overflow interrupt not set when attempting to get it - -ENODEV: PMUv3 not supported - -EINVAL: Invalid PMU overflow interrupt number supplied or - trying to set the IRQ number without using an in-kernel - irqchip. - -A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt -number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt -type must be same for each vcpu. As a PPI, the interrupt number is the same for -all vcpus, while as an SPI it must be a separate number per vcpu. - -1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT -Parameters: no additional parameter in kvm_device_attr.addr -Returns: -ENODEV: PMUv3 not supported or GIC not initialized - -ENXIO: PMUv3 not properly configured or in-kernel irqchip not - configured as required prior to calling this attribute - -EBUSY: PMUv3 already initialized - -Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel -virtual GIC implementation, this must be done after initializing the in-kernel -irqchip. - - -2. GROUP: KVM_ARM_VCPU_TIMER_CTRL -Architectures: ARM,ARM64 - -2.1. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_VTIMER -2.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_PTIMER -Parameters: in kvm_device_attr.addr the address for the timer interrupt is a - pointer to an int -Returns: -EINVAL: Invalid timer interrupt number - -EBUSY: One or more VCPUs has already run - -A value describing the architected timer interrupt number when connected to an -in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the -attribute overrides the default values (see below). - -KVM_ARM_VCPU_TIMER_IRQ_VTIMER: The EL1 virtual timer intid (default: 27) -KVM_ARM_VCPU_TIMER_IRQ_PTIMER: The EL1 physical timer intid (default: 30) - -Setting the same PPI for different timers will prevent the VCPUs from running. -Setting the interrupt number on a VCPU configures all VCPUs created at that -time to use the number provided for a given timer, overwriting any previously -configured values on other VCPUs. Userspace should configure the interrupt -numbers on at least one VCPU after creating all VCPUs and before running any -VCPUs. diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt deleted file mode 100644 index 528c77c8022c..000000000000 --- a/Documentation/virtual/kvm/devices/vfio.txt +++ /dev/null @@ -1,36 +0,0 @@ -VFIO virtual device -=================== - -Device types supported: - KVM_DEV_TYPE_VFIO - -Only one VFIO instance may be created per VM. The created device -tracks VFIO groups in use by the VM and features of those groups -important to the correctness and acceleration of the VM. As groups -are enabled and disabled for use by the VM, KVM should be updated -about their presence. When registered with KVM, a reference to the -VFIO-group is held by KVM. - -Groups: - KVM_DEV_VFIO_GROUP - -KVM_DEV_VFIO_GROUP attributes: - KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. - KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. - KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table - allocated by sPAPR KVM. - kvm_device_attr.addr points to a struct: - - struct kvm_vfio_spapr_tce { - __s32 groupfd; - __s32 tablefd; - }; - - where - @groupfd is a file descriptor for a VFIO group; - @tablefd is a file descriptor for a TCE table allocated via - KVM_CREATE_SPAPR_TCE. diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt deleted file mode 100644 index 4ffb82b02468..000000000000 --- a/Documentation/virtual/kvm/devices/vm.txt +++ /dev/null @@ -1,270 +0,0 @@ -Generic vm interface -==================================== - -The virtual machine "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, -KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same -struct kvm_device_attr as other devices, but targets VM-wide settings -and controls. - -The groups and attributes per virtual machine, if any, are architecture -specific. - -1. GROUP: KVM_S390_VM_MEM_CTRL -Architectures: s390 - -1.1. ATTRIBUTE: KVM_S390_VM_MEM_ENABLE_CMMA -Parameters: none -Returns: -EBUSY if a vcpu is already defined, otherwise 0 - -Enables Collaborative Memory Management Assist (CMMA) for the virtual machine. - -1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA -Parameters: none -Returns: -EINVAL if CMMA was not enabled - 0 otherwise - -Clear the CMMA status for all guest pages, so any pages the guest marked -as unused are again used any may not be reclaimed by the host. - -1.3. ATTRIBUTE KVM_S390_VM_MEM_LIMIT_SIZE -Parameters: in attr->addr the address for the new limit of guest memory -Returns: -EFAULT if the given address is not accessible - -EINVAL if the virtual machine is of type UCONTROL - -E2BIG if the given guest memory is to big for that machine - -EBUSY if a vcpu is already defined - -ENOMEM if not enough memory is available for a new shadow guest mapping - 0 otherwise - -Allows userspace to query the actual limit and set a new limit for -the maximum guest memory size. The limit will be rounded up to -2048 MB, 4096 GB, 8192 TB respectively, as this limit is governed by -the number of page table levels. In the case that there is no limit we will set -the limit to KVM_S390_NO_MEM_LIMIT (U64_MAX). - -2. GROUP: KVM_S390_VM_CPU_MODEL -Architectures: s390 - -2.1. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE (r/o) - -Allows user space to retrieve machine and kvm specific cpu related information: - -struct kvm_s390_vm_cpu_machine { - __u64 cpuid; # CPUID of host - __u32 ibc; # IBC level range offered by host - __u8 pad[4]; - __u64 fac_mask[256]; # set of cpu facilities enabled by KVM - __u64 fac_list[256]; # set of cpu facilities offered by host -} - -Parameters: address of buffer to store the machine related cpu data - of type struct kvm_s390_vm_cpu_machine* -Returns: -EFAULT if the given address is not accessible from kernel space - -ENOMEM if not enough memory is available to process the ioctl - 0 in case of success - -2.2. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR (r/w) - -Allows user space to retrieve or request to change cpu related information for a vcpu: - -struct kvm_s390_vm_cpu_processor { - __u64 cpuid; # CPUID currently (to be) used by this vcpu - __u16 ibc; # IBC level currently (to be) used by this vcpu - __u8 pad[6]; - __u64 fac_list[256]; # set of cpu facilities currently (to be) used - # by this vcpu -} - -KVM does not enforce or limit the cpu model data in any form. Take the information -retrieved by means of KVM_S390_VM_CPU_MACHINE as hint for reasonable configuration -setups. Instruction interceptions triggered by additionally set facility bits that -are not handled by KVM need to by imlemented in the VM driver code. - -Parameters: address of buffer to store/set the processor related cpu - data of type struct kvm_s390_vm_cpu_processor*. -Returns: -EBUSY in case 1 or more vcpus are already activated (only in write case) - -EFAULT if the given address is not accessible from kernel space - -ENOMEM if not enough memory is available to process the ioctl - 0 in case of success - -2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o) - -Allows user space to retrieve available cpu features. A feature is available if -provided by the hardware and supported by kvm. In theory, cpu features could -even be completely emulated by kvm. - -struct kvm_s390_vm_cpu_feat { - __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering -}; - -Parameters: address of a buffer to load the feature list from. -Returns: -EFAULT if the given address is not accessible from kernel space. - 0 in case of success. - -2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w) - -Allows user space to retrieve or change enabled cpu features for all VCPUs of a -VM. Features that are not available cannot be enabled. - -See 2.3. for a description of the parameter struct. - -Parameters: address of a buffer to store/load the feature list from. -Returns: -EFAULT if the given address is not accessible from kernel space. - -EINVAL if a cpu feature that is not available is to be enabled. - -EBUSY if at least one VCPU has already been defined. - 0 in case of success. - -2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o) - -Allows user space to retrieve available cpu subfunctions without any filtering -done by a set IBC. These subfunctions are indicated to the guest VCPU via -query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff. - -A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the -STFL(E) bit introducing the affected instruction. If the affected instruction -indicates subfunctions via a "query subfunction", the response block is -contained in the returned struct. If the affected instruction -indicates subfunctions via a "test bit" mechanism, the subfunction codes are -contained in the returned struct in MSB 0 bit numbering. - -struct kvm_s390_vm_cpu_subfunc { - u8 plo[32]; # always valid (ESA/390 feature) - u8 ptff[16]; # valid with TOD-clock steering - u8 kmac[16]; # valid with Message-Security-Assist - u8 kmc[16]; # valid with Message-Security-Assist - u8 km[16]; # valid with Message-Security-Assist - u8 kimd[16]; # valid with Message-Security-Assist - u8 klmd[16]; # valid with Message-Security-Assist - u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3 - u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4 - u8 kmf[16]; # valid with Message-Security-Assist-Extension 4 - u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 - u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 - u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 - u8 kma[16]; # valid with Message-Security-Assist-Extension 8 - u8 kdsa[16]; # valid with Message-Security-Assist-Extension 9 - u8 reserved[1792]; # reserved for future instructions -}; - -Parameters: address of a buffer to load the subfunction blocks from. -Returns: -EFAULT if the given address is not accessible from kernel space. - 0 in case of success. - -2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w) - -Allows user space to retrieve or change cpu subfunctions to be indicated for -all VCPUs of a VM. This attribute will only be available if kernel and -hardware support are in place. - -The kernel uses the configured subfunction blocks for indication to -the guest. A subfunction block will only be used if the associated STFL(E) bit -has not been disabled by user space (so the instruction to be queried is -actually available for the guest). - -As long as no data has been written, a read will fail. The IBC will be used -to determine available subfunctions in this case, this will guarantee backward -compatibility. - -See 2.5. for a description of the parameter struct. - -Parameters: address of a buffer to store/load the subfunction blocks from. -Returns: -EFAULT if the given address is not accessible from kernel space. - -EINVAL when reading, if there was no write yet. - -EBUSY if at least one VCPU has already been defined. - 0 in case of success. - -3. GROUP: KVM_S390_VM_TOD -Architectures: s390 - -3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH - -Allows user space to set/get the TOD clock extension (u8) (superseded by -KVM_S390_VM_TOD_EXT). - -Parameters: address of a buffer in user space to store the data (u8) to -Returns: -EFAULT if the given address is not accessible from kernel space - -EINVAL if setting the TOD clock extension to != 0 is not supported - -3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW - -Allows user space to set/get bits 0-63 of the TOD clock register as defined in -the POP (u64). - -Parameters: address of a buffer in user space to store the data (u64) to -Returns: -EFAULT if the given address is not accessible from kernel space - -3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT -Allows user space to set/get bits 0-63 of the TOD clock register as defined in -the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it -also allows user space to get/set it. If the guest CPU model does not support -it, it is stored as 0 and not allowed to be set to a value != 0. - -Parameters: address of a buffer in user space to store the data - (kvm_s390_vm_tod_clock) to -Returns: -EFAULT if the given address is not accessible from kernel space - -EINVAL if setting the TOD clock extension to != 0 is not supported - -4. GROUP: KVM_S390_VM_CRYPTO -Architectures: s390 - -4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o) - -Allows user space to enable aes key wrapping, including generating a new -wrapping key. - -Parameters: none -Returns: 0 - -4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o) - -Allows user space to enable dea key wrapping, including generating a new -wrapping key. - -Parameters: none -Returns: 0 - -4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o) - -Allows user space to disable aes key wrapping, clearing the wrapping key. - -Parameters: none -Returns: 0 - -4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o) - -Allows user space to disable dea key wrapping, clearing the wrapping key. - -Parameters: none -Returns: 0 - -5. GROUP: KVM_S390_VM_MIGRATION -Architectures: s390 - -5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o) - -Allows userspace to stop migration mode, needed for PGSTE migration. -Setting this attribute when migration mode is not active will have no -effects. - -Parameters: none -Returns: 0 - -5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o) - -Allows userspace to start migration mode, needed for PGSTE migration. -Setting this attribute when migration mode is already active will have -no effects. - -Parameters: none -Returns: -ENOMEM if there is not enough free memory to start migration mode - -EINVAL if the state of the VM is invalid (e.g. no memory defined) - 0 in case of success. - -5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o) - -Allows userspace to query the status of migration mode. - -Parameters: address of a buffer in user space to store the data (u64) to; - the data itself is either 0 if migration mode is disabled or 1 - if it is enabled -Returns: -EFAULT if the given address is not accessible from kernel space - 0 in case of success. diff --git a/Documentation/virtual/kvm/devices/xics.txt b/Documentation/virtual/kvm/devices/xics.txt deleted file mode 100644 index 42864935ac5d..000000000000 --- a/Documentation/virtual/kvm/devices/xics.txt +++ /dev/null @@ -1,66 +0,0 @@ -XICS interrupt controller - -Device type supported: KVM_DEV_TYPE_XICS - -Groups: - KVM_DEV_XICS_SOURCES - Attributes: One per interrupt source, indexed by the source number. - -This device emulates the XICS (eXternal Interrupt Controller -Specification) defined in PAPR. The XICS has a set of interrupt -sources, each identified by a 20-bit source number, and a set of -Interrupt Control Presentation (ICP) entities, also called "servers", -each associated with a virtual CPU. - -The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH -capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and -the interrupt server number (i.e. the vcpu number from the XICS's -point of view) in args[1] of the kvm_enable_cap struct. Each ICP has -64 bits of state which can be read and written using the -KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit -state word has the following bitfields, starting at the -least-significant end of the word: - -* Unused, 16 bits - -* Pending interrupt priority, 8 bits - Zero is the highest priority, 255 means no interrupt is pending. - -* Pending IPI (inter-processor interrupt) priority, 8 bits - Zero is the highest priority, 255 means no IPI is pending. - -* Pending interrupt source number, 24 bits - Zero means no interrupt pending, 2 means an IPI is pending - -* Current processor priority, 8 bits - Zero is the highest priority, meaning no interrupts can be - delivered, and 255 is the lowest priority. - -Each source has 64 bits of state that can be read and written using -the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the -KVM_DEV_XICS_SOURCES attribute group, with the attribute number being -the interrupt source number. The 64 bit state word has the following -bitfields, starting from the least-significant end of the word: - -* Destination (server number), 32 bits - This specifies where the interrupt should be sent, and is the - interrupt server number specified for the destination vcpu. - -* Priority, 8 bits - This is the priority specified for this interrupt source, where 0 is - the highest priority and 255 is the lowest. An interrupt with a - priority of 255 will never be delivered. - -* Level sensitive flag, 1 bit - This bit is 1 for a level-sensitive interrupt source, or 0 for - edge-sensitive (or MSI). - -* Masked flag, 1 bit - This bit is set to 1 if the interrupt is masked (cannot be delivered - regardless of its priority), for example by the ibm,int-off RTAS - call, or 0 if it is not masked. - -* Pending flag, 1 bit - This bit is 1 if the source has a pending interrupt, otherwise 0. - -Only one XICS instance may be created per VM. diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt deleted file mode 100644 index 9a24a4525253..000000000000 --- a/Documentation/virtual/kvm/devices/xive.txt +++ /dev/null @@ -1,197 +0,0 @@ -POWER9 eXternal Interrupt Virtualization Engine (XIVE Gen1) -========================================================== - -Device types supported: - KVM_DEV_TYPE_XIVE POWER9 XIVE Interrupt Controller generation 1 - -This device acts as a VM interrupt controller. It provides the KVM -interface to configure the interrupt sources of a VM in the underlying -POWER9 XIVE interrupt controller. - -Only one XIVE instance may be instantiated. A guest XIVE device -requires a POWER9 host and the guest OS should have support for the -XIVE native exploitation interrupt mode. If not, it should run using -the legacy interrupt mode, referred as XICS (POWER7/8). - -* Device Mappings - - The KVM device exposes different MMIO ranges of the XIVE HW which - are required for interrupt management. These are exposed to the - guest in VMAs populated with a custom VM fault handler. - - 1. Thread Interrupt Management Area (TIMA) - - Each thread has an associated Thread Interrupt Management context - composed of a set of registers. These registers let the thread - handle priority management and interrupt acknowledgment. The most - important are : - - - Interrupt Pending Buffer (IPB) - - Current Processor Priority (CPPR) - - Notification Source Register (NSR) - - They are exposed to software in four different pages each proposing - a view with a different privilege. The first page is for the - physical thread context and the second for the hypervisor. Only the - third (operating system) and the fourth (user level) are exposed the - guest. - - 2. Event State Buffer (ESB) - - Each source is associated with an Event State Buffer (ESB) with - either a pair of even/odd pair of pages which provides commands to - manage the source: to trigger, to EOI, to turn off the source for - instance. - - 3. Device pass-through - - When a device is passed-through into the guest, the source - interrupts are from a different HW controller (PHB4) and the ESB - pages exposed to the guest should accommadate this change. - - The passthru_irq helpers, kvmppc_xive_set_mapped() and - kvmppc_xive_clr_mapped() are called when the device HW irqs are - mapped into or unmapped from the guest IRQ number space. The KVM - device extends these helpers to clear the ESB pages of the guest IRQ - number being mapped and then lets the VM fault handler repopulate. - The handler will insert the ESB page corresponding to the HW - interrupt of the device being passed-through or the initial IPI ESB - page if the device has being removed. - - The ESB remapping is fully transparent to the guest and the OS - device driver. All handling is done within VFIO and the above - helpers in KVM-PPC. - -* Groups: - - 1. KVM_DEV_XIVE_GRP_CTRL - Provides global controls on the device - Attributes: - 1.1 KVM_DEV_XIVE_RESET (write only) - Resets the interrupt controller configuration for sources and event - queues. To be used by kexec and kdump. - Errors: none - - 1.2 KVM_DEV_XIVE_EQ_SYNC (write only) - Sync all the sources and queues and mark the EQ pages dirty. This - to make sure that a consistent memory state is captured when - migrating the VM. - Errors: none - - 2. KVM_DEV_XIVE_GRP_SOURCE (write only) - Initializes a new source in the XIVE device and mask it. - Attributes: - Interrupt source number (64-bit) - The kvm_device_attr.addr points to a __u64 value: - bits: | 63 .... 2 | 1 | 0 - values: | unused | level | type - - type: 0:MSI 1:LSI - - level: assertion level in case of an LSI. - Errors: - -E2BIG: Interrupt source number is out of range - -ENOMEM: Could not create a new source block - -EFAULT: Invalid user pointer for attr->addr. - -ENXIO: Could not allocate underlying HW interrupt - - 3. KVM_DEV_XIVE_GRP_SOURCE_CONFIG (write only) - Configures source targeting - Attributes: - Interrupt source number (64-bit) - The kvm_device_attr.addr points to a __u64 value: - bits: | 63 .... 33 | 32 | 31 .. 3 | 2 .. 0 - values: | eisn | mask | server | priority - - priority: 0-7 interrupt priority level - - server: CPU number chosen to handle the interrupt - - mask: mask flag (unused) - - eisn: Effective Interrupt Source Number - Errors: - -ENOENT: Unknown source number - -EINVAL: Not initialized source number - -EINVAL: Invalid priority - -EINVAL: Invalid CPU number. - -EFAULT: Invalid user pointer for attr->addr. - -ENXIO: CPU event queues not configured or configuration of the - underlying HW interrupt failed - -EBUSY: No CPU available to serve interrupt - - 4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write) - Configures an event queue of a CPU - Attributes: - EQ descriptor identifier (64-bit) - The EQ descriptor identifier is a tuple (server, priority) : - bits: | 63 .... 32 | 31 .. 3 | 2 .. 0 - values: | unused | server | priority - The kvm_device_attr.addr points to : - struct kvm_ppc_xive_eq { - __u32 flags; - __u32 qshift; - __u64 qaddr; - __u32 qtoggle; - __u32 qindex; - __u8 pad[40]; - }; - - flags: queue flags - KVM_XIVE_EQ_ALWAYS_NOTIFY (required) - forces notification without using the coalescing mechanism - provided by the XIVE END ESBs. - - qshift: queue size (power of 2) - - qaddr: real address of queue - - qtoggle: current queue toggle bit - - qindex: current queue index - - pad: reserved for future use - Errors: - -ENOENT: Invalid CPU number - -EINVAL: Invalid priority - -EINVAL: Invalid flags - -EINVAL: Invalid queue size - -EINVAL: Invalid queue address - -EFAULT: Invalid user pointer for attr->addr. - -EIO: Configuration of the underlying HW failed - - 5. KVM_DEV_XIVE_GRP_SOURCE_SYNC (write only) - Synchronize the source to flush event notifications - Attributes: - Interrupt source number (64-bit) - Errors: - -ENOENT: Unknown source number - -EINVAL: Not initialized source number - -* VCPU state - - The XIVE IC maintains VP interrupt state in an internal structure - called the NVT. When a VP is not dispatched on a HW processor - thread, this structure can be updated by HW if the VP is the target - of an event notification. - - It is important for migration to capture the cached IPB from the NVT - as it synthesizes the priorities of the pending interrupts. We - capture a bit more to report debug information. - - KVM_REG_PPC_VP_STATE (2 * 64bits) - bits: | 63 .... 32 | 31 .... 0 | - values: | TIMA word0 | TIMA word1 | - bits: | 127 .......... 64 | - values: | unused | - -* Migration: - - Saving the state of a VM using the XIVE native exploitation mode - should follow a specific sequence. When the VM is stopped : - - 1. Mask all sources (PQ=01) to stop the flow of events. - - 2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to - flush any in-flight event notification and to stabilize the EQs. At - this stage, the EQ pages are marked dirty to make sure they are - transferred in the migration sequence. - - 3. Capture the state of the source targeting, the EQs configuration - and the state of thread interrupt context registers. - - Restore is similar : - - 1. Restore the EQ configuration. As targeting depends on it. - 2. Restore targeting - 3. Restore the thread interrupt contexts - 4. Restore the source states - 5. Let the vCPU run diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virtual/kvm/halt-polling.txt deleted file mode 100644 index 4f791b128dd2..000000000000 --- a/Documentation/virtual/kvm/halt-polling.txt +++ /dev/null @@ -1,136 +0,0 @@ -The KVM halt polling system -=========================== - -The KVM halt polling system provides a feature within KVM whereby the latency -of a guest can, under some circumstances, be reduced by polling in the host -for some time period after the guest has elected to no longer run by cedeing. -That is, when a guest vcpu has ceded, or in the case of powerpc when all of the -vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions -before giving up the cpu to the scheduler in order to let something else run. - -Polling provides a latency advantage in cases where the guest can be run again -very quickly by at least saving us a trip through the scheduler, normally on -the order of a few micro-seconds, although performance benefits are workload -dependant. In the event that no wakeup source arrives during the polling -interval or some other task on the runqueue is runnable the scheduler is -invoked. Thus halt polling is especially useful on workloads with very short -wakeup periods where the time spent halt polling is minimised and the time -savings of not invoking the scheduler are distinguishable. - -The generic halt polling code is implemented in: - - virt/kvm/kvm_main.c: kvm_vcpu_block() - -The powerpc kvm-hv specific case is implemented in: - - arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked() - -Halt Polling Interval -===================== - -The maximum time for which to poll before invoking the scheduler, referred to -as the halt polling interval, is increased and decreased based on the perceived -effectiveness of the polling in an attempt to limit pointless polling. -This value is stored in either the vcpu struct: - - kvm_vcpu->halt_poll_ns - -or in the case of powerpc kvm-hv, in the vcore struct: - - kvmppc_vcore->halt_poll_ns - -Thus this is a per vcpu (or vcore) value. - -During polling if a wakeup source is received within the halt polling interval, -the interval is left unchanged. In the event that a wakeup source isn't -received during the polling interval (and thus schedule is invoked) there are -two options, either the polling interval and total block time[0] were less than -the global max polling interval (see module params below), or the total block -time was greater than the global max polling interval. - -In the event that both the polling interval and total block time were less than -the global max polling interval then the polling interval can be increased in -the hope that next time during the longer polling interval the wake up source -will be received while the host is polling and the latency benefits will be -received. The polling interval is grown in the function grow_halt_poll_ns() and -is multiplied by the module parameters halt_poll_ns_grow and -halt_poll_ns_grow_start. - -In the event that the total block time was greater than the global max polling -interval then the host will never poll for long enough (limited by the global -max) to wakeup during the polling interval so it may as well be shrunk in order -to avoid pointless polling. The polling interval is shrunk in the function -shrink_halt_poll_ns() and is divided by the module parameter -halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0. - -It is worth noting that this adjustment process attempts to hone in on some -steady state polling interval but will only really do a good job for wakeups -which come at an approximately constant rate, otherwise there will be constant -adjustment of the polling interval. - -[0] total block time: the time between when the halt polling function is - invoked and a wakeup source received (irrespective of - whether the scheduler is invoked within that function). - -Module Parameters -================= - -The kvm module has 3 tuneable module parameters to adjust the global max -polling interval as well as the rate at which the polling interval is grown and -shrunk. These variables are defined in include/linux/kvm_host.h and as module -parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the -powerpc kvm-hv case. - -Module Parameter | Description | Default Value --------------------------------------------------------------------------------- -halt_poll_ns | The global max polling | KVM_HALT_POLL_NS_DEFAULT - | interval which defines | - | the ceiling value of the | - | polling interval for | (per arch value) - | each vcpu. | --------------------------------------------------------------------------------- -halt_poll_ns_grow | The value by which the | 2 - | halt polling interval is | - | multiplied in the | - | grow_halt_poll_ns() | - | function. | --------------------------------------------------------------------------------- -halt_poll_ns_grow_start | The initial value to grow | 10000 - | to from zero in the | - | grow_halt_poll_ns() | - | function. | --------------------------------------------------------------------------------- -halt_poll_ns_shrink | The value by which the | 0 - | halt polling interval is | - | divided in the | - | shrink_halt_poll_ns() | - | function. | --------------------------------------------------------------------------------- - -These module parameters can be set from the debugfs files in: - - /sys/module/kvm/parameters/ - -Note: that these module parameters are system wide values and are not able to - be tuned on a per vm basis. - -Further Notes -============= - -- Care should be taken when setting the halt_poll_ns module parameter as a -large value has the potential to drive the cpu usage to 100% on a machine which -would be almost entirely idle otherwise. This is because even if a guest has -wakeups during which very little work is done and which are quite far apart, if -the period is shorter than the global max polling interval (halt_poll_ns) then -the host will always poll for the entire block time and thus cpu utilisation -will go to 100%. - -- Halt polling essentially presents a trade off between power usage and latency -and the module parameters should be used to tune the affinity for this. Idle -cpu time is essentially converted to host kernel time with the aim of decreasing -latency when entering the guest. - -- Halt polling will only be conducted by the host when no other tasks are -runnable on that cpu, otherwise the polling will cease immediately and -schedule will be invoked to allow that other task to run. Thus this doesn't -allow a guest to denial of service the cpu. diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt deleted file mode 100644 index da210651f714..000000000000 --- a/Documentation/virtual/kvm/hypercalls.txt +++ /dev/null @@ -1,154 +0,0 @@ -Linux KVM Hypercall: -=================== -X86: - KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall - instruction. The hypervisor can replace it with instructions that are - guaranteed to be supported. - - Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. - The hypercall number should be placed in rax and the return value will be - placed in rax. No other registers will be clobbered unless explicitly stated - by the particular hypercall. - -S390: - R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall - number. The return value is written to R2. - - S390 uses diagnose instruction as hypercall (0x500) along with hypercall - number in R1. - - For further information on the S390 diagnose call as supported by KVM, - refer to Documentation/virtual/kvm/s390-diag.txt. - - PowerPC: - It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. - Return value is placed in R3. - - KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' - property inside the device tree's /hypervisor node. - For more information refer to Documentation/virtual/kvm/ppc-pv.txt - -MIPS: - KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall - number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and - the return value is placed in $2 (v0). - -KVM Hypercalls Documentation -=========================== -The template for each hypercall is: -1. Hypercall name. -2. Architecture(s) -3. Status (deprecated, obsolete, active) -4. Purpose - -1. KVM_HC_VAPIC_POLL_IRQ ------------------------- -Architecture: x86 -Status: active -Purpose: Trigger guest exit so that the host can check for pending -interrupts on reentry. - -2. KVM_HC_MMU_OP ------------------------- -Architecture: x86 -Status: deprecated. -Purpose: Support MMU operations such as writing to PTE, -flushing TLB, release PT. - -3. KVM_HC_FEATURES ------------------------- -Architecture: PPC -Status: active -Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid -used to enumerate which hypercalls are available. On PPC, either device tree -based lookup ( which is also what EPAPR dictates) OR KVM specific enumeration -mechanism (which is this hypercall) can be used. - -4. KVM_HC_PPC_MAP_MAGIC_PAGE ------------------------- -Architecture: PPC -Status: active -Purpose: To enable communication between the hypervisor and guest there is a -shared page that contains parts of supervisor visible register state. -The guest can map this shared page to access its supervisor register through -memory using this hypercall. - -5. KVM_HC_KICK_CPU ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to wakeup a vcpu from HLT state -Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest -kernel mode for an event to occur (ex: a spinlock to become available) can -execute HLT instruction once it has busy-waited for more than a threshold -time-interval. Execution of HLT instruction would cause the hypervisor to put -the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the -same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, -specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) -is used in the hypercall for future use. - - -6. KVM_HC_CLOCK_PAIRING ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to synchronize host and guest clocks. -Usage: - -a0: guest physical address where host copies -"struct kvm_clock_offset" structure. - -a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) -is supported (corresponding to the host's CLOCK_REALTIME clock). - - struct kvm_clock_pairing { - __s64 sec; - __s64 nsec; - __u64 tsc; - __u32 flags; - __u32 pad[9]; - }; - - Where: - * sec: seconds from clock_type clock. - * nsec: nanoseconds from clock_type clock. - * tsc: guest TSC value used to calculate sec/nsec pair - * flags: flags, unused (0) at the moment. - -The hypercall lets a guest compute a precise timestamp across -host and guest. The guest can use the returned TSC value to -compute the CLOCK_REALTIME for its clock, at the same instant. - -Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, -or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. - -6. KVM_HC_SEND_IPI ------------------------- -Architecture: x86 -Status: active -Purpose: Send IPIs to multiple vCPUs. - -a0: lower part of the bitmap of destination APIC IDs -a1: higher part of the bitmap of destination APIC IDs -a2: the lowest APIC ID in bitmap -a3: APIC ICR - -The hypercall lets a guest send multicast IPIs, with at most 128 -128 destinations per hypercall in 64-bit mode and 64 vCPUs per -hypercall in 32-bit mode. The destinations are represented by a -bitmap contained in the first two arguments (a0 and a1). Bit 0 of -a0 corresponds to the APIC ID in the third argument (a2), bit 1 -corresponds to the APIC ID a2+1, and so on. - -Returns the number of CPUs to which the IPIs were delivered successfully. - -7. KVM_HC_SCHED_YIELD ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to yield if the IPI target vCPU is preempted - -a0: destination APIC ID - -Usage example: When sending a call-function IPI-many to vCPUs, yield if -any of the IPI target vCPUs was preempted. diff --git a/Documentation/virtual/kvm/index.rst b/Documentation/virtual/kvm/index.rst deleted file mode 100644 index 0b206a06f5be..000000000000 --- a/Documentation/virtual/kvm/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=== -KVM -=== - -.. toctree:: - :maxdepth: 2 - - amd-memory-encryption - cpuid diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt deleted file mode 100644 index 635cd6eaf714..000000000000 --- a/Documentation/virtual/kvm/locking.txt +++ /dev/null @@ -1,215 +0,0 @@ -KVM Lock Overview -================= - -1. Acquisition Orders ---------------------- - -The acquisition orders for mutexes are as follows: - -- kvm->lock is taken outside vcpu->mutex - -- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock - -- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring - them together is quite rare. - -On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. - -Everything else is a leaf: no other lock is taken inside the critical -sections. - -2: Exception ------------- - -Fast page fault: - -Fast page fault is the fast path which fixes the guest page fault out of -the mmu-lock on x86. Currently, the page fault can be fast in one of the -following two cases: - -1. Access Tracking: The SPTE is not present, but it is marked for access -tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to -restore the saved R/X bits. This is described in more detail later below. - -2. Write-Protection: The SPTE is present and the fault is -caused by write-protect. That means we just need to change the W bit of the -spte. - -What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and -SPTE_MMU_WRITEABLE bit on the spte: -- SPTE_HOST_WRITEABLE means the gfn is writable on host. -- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when - the gfn is writable on guest mmu and it is not write-protected by shadow - page write-protection. - -On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or -restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This -is safe because whenever changing these bits can be detected by cmpxchg. - -But we need carefully check these cases: -1): The mapping from gfn to pfn -The mapping from gfn to pfn may be changed since we can only ensure the pfn -is not changed during cmpxchg. This is a ABA problem, for example, below case -will happen: - -At the beginning: -gpte = gfn1 -gfn1 is mapped to pfn1 on host -spte is the shadow page table entry corresponding with gpte and -spte = pfn1 - - VCPU 0 VCPU0 -on fast page fault path: - - old_spte = *spte; - pfn1 is swapped out: - spte = 0; - - pfn1 is re-alloced for gfn2. - - gpte is changed to point to - gfn2 by the guest: - spte = pfn1; - - if (cmpxchg(spte, old_spte, old_spte+W) - mark_page_dirty(vcpu->kvm, gfn1) - OOPS!!! - -We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. - -For direct sp, we can easily avoid it since the spte of direct sp is fixed -to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() -to pin gfn to pfn, because after gfn_to_pfn_atomic(): -- We have held the refcount of pfn that means the pfn can not be freed and - be reused for another gfn. -- The pfn is writable that means it can not be shared between different gfns - by KSM. - -Then, we can ensure the dirty bitmaps is correctly set for a gfn. - -Currently, to simplify the whole things, we disable fast page fault for -indirect shadow page. - -2): Dirty bit tracking -In the origin code, the spte can be fast updated (non-atomically) if the -spte is read-only and the Accessed bit has already been set since the -Accessed bit and Dirty bit can not be lost. - -But it is not true after fast page fault since the spte can be marked -writable between reading spte and updating spte. Like below case: - -At the beginning: -spte.W = 0 -spte.Accessed = 1 - - VCPU 0 VCPU0 -In mmu_spte_clear_track_bits(): - - old_spte = *spte; - - /* 'if' condition is satisfied. */ - if (old_spte.Accessed == 1 && - old_spte.W == 0) - spte = 0ull; - on fast page fault path: - spte.W = 1 - memory write on the spte: - spte.Dirty = 1 - - - else - old_spte = xchg(spte, 0ull) - - - if (old_spte.Accessed == 1) - kvm_set_pfn_accessed(spte.pfn); - if (old_spte.Dirty == 1) - kvm_set_pfn_dirty(spte.pfn); - OOPS!!! - -The Dirty bit is lost in this case. - -In order to avoid this kind of issue, we always treat the spte as "volatile" -if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, -the spte is always atomically updated in this case. - -3): flush tlbs due to spte updated -If the spte is updated from writable to readonly, we should flush all TLBs, -otherwise rmap_write_protect will find a read-only spte, even though the -writable spte might be cached on a CPU's TLB. - -As mentioned before, the spte can be updated to writable out of mmu-lock on -fast page fault path, in order to easily audit the path, we see if TLBs need -be flushed caused by this reason in mmu_spte_update() since this is a common -function to update spte (present -> present). - -Since the spte is "volatile" if it can be updated out of mmu-lock, we always -atomically update the spte, the race caused by fast page fault can be avoided, -See the comments in spte_has_volatile_bits() and mmu_spte_update(). - -Lockless Access Tracking: - -This is used for Intel CPUs that are using EPT but do not support the EPT A/D -bits. In this case, when the KVM MMU notifier is called to track accesses to a -page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present -by clearing the RWX bits in the PTE and storing the original R & X bits in -some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the -PTE (using the ignored bit 62). When the VM tries to access the page later on, -a fault is generated and the fast page fault mechanism described above is used -to atomically restore the PTE to a Present state. The W bit is not saved when -the PTE is marked for access tracking and during restoration to the Present -state, the W bit is set depending on whether or not it was a write access. If -it wasn't, then the W bit will remain clear until a write access happens, at -which time it will be set using the Dirty tracking mechanism described above. - -3. Reference ------------- - -Name: kvm_lock -Type: mutex -Arch: any -Protects: - vm_list - -Name: kvm_count_lock -Type: raw_spinlock_t -Arch: any -Protects: - hardware virtualization enable/disable -Comment: 'raw' because hardware enabling/disabling must be atomic /wrt - migration. - -Name: kvm_arch::tsc_write_lock -Type: raw_spinlock -Arch: x86 -Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} - - tsc offset in vmcb -Comment: 'raw' because updating the tsc offsets must not be preempted. - -Name: kvm->mmu_lock -Type: spinlock_t -Arch: any -Protects: -shadow page/shadow tlb entry -Comment: it is a spinlock since it is used in mmu notifier. - -Name: kvm->srcu -Type: srcu lock -Arch: any -Protects: - kvm->memslots - - kvm->buses -Comment: The srcu read lock must be held while accessing memslots (e.g. - when using gfn_to_* functions) and while accessing in-kernel - MMIO/PIO address->device structure mapping (kvm->buses). - The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu - if it is needed by multiple functions. - -Name: blocked_vcpu_on_cpu_lock -Type: spinlock_t -Arch: x86 -Protects: blocked_vcpu_on_cpu -Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. - When VT-d posted-interrupts is supported and the VM has assigned - devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu - protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues - wakeup notification event since external interrupts from the - assigned devices happens, we will find the vCPU on the list to - wakeup. diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt deleted file mode 100644 index 2efe0efc516e..000000000000 --- a/Documentation/virtual/kvm/mmu.txt +++ /dev/null @@ -1,449 +0,0 @@ -The x86 kvm shadow mmu -====================== - -The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible -for presenting a standard x86 mmu to the guest, while translating guest -physical addresses to host physical addresses. - -The mmu code attempts to satisfy the following requirements: - -- correctness: the guest should not be able to determine that it is running - on an emulated mmu except for timing (we attempt to comply - with the specification, not emulate the characteristics of - a particular implementation such as tlb size) -- security: the guest must not be able to touch host memory not assigned - to it -- performance: minimize the performance penalty imposed by the mmu -- scaling: need to scale to large memory and large vcpu guests -- hardware: support the full range of x86 virtualization hardware -- integration: Linux memory management code must be in control of guest memory - so that swapping, page migration, page merging, transparent - hugepages, and similar features work without change -- dirty tracking: report writes to guest memory to enable live migration - and framebuffer-based displays -- footprint: keep the amount of pinned kernel memory low (most memory - should be shrinkable) -- reliability: avoid multipage or GFP_ATOMIC allocations - -Acronyms -======== - -pfn host page frame number -hpa host physical address -hva host virtual address -gfn guest frame number -gpa guest physical address -gva guest virtual address -ngpa nested guest physical address -ngva nested guest virtual address -pte page table entry (used also to refer generically to paging structure - entries) -gpte guest pte (referring to gfns) -spte shadow pte (referring to pfns) -tdp two dimensional paging (vendor neutral term for NPT and EPT) - -Virtual and real hardware supported -=================================== - -The mmu supports first-generation mmu hardware, which allows an atomic switch -of the current paging mode and cr3 during guest entry, as well as -two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware -it exposes is the traditional 2/3/4 level x86 mmu, with support for global -pages, pae, pse, pse36, cr0.wp, and 1GB pages. Emulated hardware also -able to expose NPT capable hardware on NPT capable hosts. - -Translation -=========== - -The primary job of the mmu is to program the processor's mmu to translate -addresses for the guest. Different translations are required at different -times: - -- when guest paging is disabled, we translate guest physical addresses to - host physical addresses (gpa->hpa) -- when guest paging is enabled, we translate guest virtual addresses, to - guest physical addresses, to host physical addresses (gva->gpa->hpa) -- when the guest launches a guest of its own, we translate nested guest - virtual addresses, to nested guest physical addresses, to guest physical - addresses, to host physical addresses (ngva->ngpa->gpa->hpa) - -The primary challenge is to encode between 1 and 3 translations into hardware -that support only 1 (traditional) and 2 (tdp) translations. When the -number of required translations matches the hardware, the mmu operates in -direct mode; otherwise it operates in shadow mode (see below). - -Memory -====== - -Guest memory (gpa) is part of the user address space of the process that is -using kvm. Userspace defines the translation between guest addresses and user -addresses (gpa->hva); note that two gpas may alias to the same hva, but not -vice versa. - -These hvas may be backed using any method available to the host: anonymous -memory, file backed memory, and device memory. Memory might be paged by the -host at any time. - -Events -====== - -The mmu is driven by events, some from the guest, some from the host. - -Guest generated events: -- writes to control registers (especially cr3) -- invlpg/invlpga instruction execution -- access to missing or protected translations - -Host generated events: -- changes in the gpa->hpa translation (either through gpa->hva changes or - through hva->hpa changes) -- memory pressure (the shrinker) - -Shadow pages -============ - -The principal data structure is the shadow page, 'struct kvm_mmu_page'. A -shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A -shadow page may contain a mix of leaf and nonleaf sptes. - -A nonleaf spte allows the hardware mmu to reach the leaf pages and -is not related to a translation directly. It points to other shadow pages. - -A leaf spte corresponds to either one or two translations encoded into -one paging structure entry. These are always the lowest level of the -translation stack, with optional higher level translations left to NPT/EPT. -Leaf ptes point at guest pages. - -The following table shows translations encoded by leaf ptes, with higher-level -translations in parentheses: - - Non-nested guests: - nonpaging: gpa->hpa - paging: gva->gpa->hpa - paging, tdp: (gva->)gpa->hpa - Nested guests: - non-tdp: ngva->gpa->hpa (*) - tdp: (ngva->)ngpa->gpa->hpa - -(*) the guest hypervisor will encode the ngva->gpa translation into its page - tables if npt is not present - -Shadow pages contain the following information: - role.level: - The level in the shadow paging hierarchy that this shadow page belongs to. - 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. - role.direct: - If set, leaf sptes reachable from this page are for a linear range. - Examples include real mode translation, large guest pages backed by small - host pages, and gpa->hpa translations when NPT or EPT is active. - The linear range starts at (gfn << PAGE_SHIFT) and its size is determined - by role.level (2MB for first level, 1GB for second level, 0.5TB for third - level, 256TB for fourth level) - If clear, this page corresponds to a guest page table denoted by the gfn - field. - role.quadrant: - When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit - sptes. That means a guest page table contains more ptes than the host, - so multiple shadow pages are needed to shadow one guest page. - For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the - first or second 512-gpte block in the guest page table. For second-level - page tables, each 32-bit gpte is converted to two 64-bit sptes - (since each first-level guest page is shadowed by two first-level - shadow pages) so role.quadrant takes values in the range 0..3. Each - quadrant maps 1GB virtual address space. - role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. - role.invalid: - The page is invalid and should not be used. It is a root page that is - currently pinned (by a cpu hardware register pointing to it); once it is - unpinned it will be destroyed. - role.gpte_is_8_bytes: - Reflects the size of the guest PTE for which the page is valid, i.e. '1' - if 64-bit gptes are in use, '0' if 32-bit gptes are in use. - role.nxe: - Contains the value of efer.nxe for which the page is valid. - role.cr0_wp: - Contains the value of cr0.wp for which the page is valid. - role.smep_andnot_wp: - Contains the value of cr4.smep && !cr0.wp for which the page is valid - (pages for which this is true are different from other pages; see the - treatment of cr0.wp=0 below). - role.smap_andnot_wp: - Contains the value of cr4.smap && !cr0.wp for which the page is valid - (pages for which this is true are different from other pages; see the - treatment of cr0.wp=0 below). - role.ept_sp: - This is a virtual flag to denote a shadowed nested EPT page. ept_sp - is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination. - role.smm: - Is 1 if the page is valid in system management mode. This field - determines which of the kvm_memslots array was used to build this - shadow page; it is also used to go back from a struct kvm_mmu_page - to a memslot, through the kvm_memslots_for_spte_role macro and - __gfn_to_memslot. - role.ad_disabled: - Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D - bits before Haswell; shadow EPT page tables also cannot use A/D bits - if the L1 hypervisor does not enable them. - gfn: - Either the guest page table containing the translations shadowed by this - page, or the base page frame for linear translations. See role.direct. - spt: - A pageful of 64-bit sptes containing the translations for this page. - Accessed by both kvm and hardware. - The page pointed to by spt will have its page->private pointing back - at the shadow page structure. - sptes in spt point either at guest pages, or at lower-level shadow pages. - Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point - at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. - The spt array forms a DAG structure with the shadow page as a node, and - guest pages as leaves. - gfns: - An array of 512 guest frame numbers, one for each present pte. Used to - perform a reverse map from a pte to a gfn. When role.direct is set, any - element of this array can be calculated from the gfn field when used, in - this case, the array of gfns is not allocated. See role.direct and gfn. - root_count: - A counter keeping track of how many hardware registers (guest cr3 or - pdptrs) are now pointing at the page. While this counter is nonzero, the - page cannot be destroyed. See role.invalid. - parent_ptes: - The reverse mapping for the pte/ptes pointing at this page's spt. If - parent_ptes bit 0 is zero, only one spte points at this page and - parent_ptes points at this single spte, otherwise, there exists multiple - sptes pointing at this page and (parent_ptes & ~0x1) points at a data - structure with a list of parent sptes. - unsync: - If true, then the translations in this page may not match the guest's - translation. This is equivalent to the state of the tlb when a pte is - changed but before the tlb entry is flushed. Accordingly, unsync ptes - are synchronized when the guest executes invlpg or flushes its tlb by - other means. Valid for leaf pages. - unsync_children: - How many sptes in the page point at pages that are unsync (or have - unsynchronized children). - unsync_child_bitmap: - A bitmap indicating which sptes in spt point (directly or indirectly) at - pages that may be unsynchronized. Used to quickly locate all unsychronized - pages reachable from a given page. - clear_spte_count: - Only present on 32-bit hosts, where a 64-bit spte cannot be written - atomically. The reader uses this while running out of the MMU lock - to detect in-progress updates and retry them until the writer has - finished the write. - write_flooding_count: - A guest may write to a page table many times, causing a lot of - emulations if the page needs to be write-protected (see "Synchronized - and unsynchronized pages" below). Leaf pages can be unsynchronized - so that they do not trigger frequent emulation, but this is not - possible for non-leafs. This field counts the number of emulations - since the last time the page table was actually used; if emulation - is triggered too frequently on this page, KVM will unmap the page - to avoid emulation in the future. - -Reverse map -=========== - -The mmu maintains a reverse mapping whereby all ptes mapping a page can be -reached given its gfn. This is used, for example, when swapping out a page. - -Synchronized and unsynchronized pages -===================================== - -The guest uses two events to synchronize its tlb and page tables: tlb flushes -and page invalidations (invlpg). - -A tlb flush means that we need to synchronize all sptes reachable from the -guest's cr3. This is expensive, so we keep all guest page tables write -protected, and synchronize sptes to gptes when a gpte is written. - -A special case is when a guest page table is reachable from the current -guest cr3. In this case, the guest is obliged to issue an invlpg instruction -before using the translation. We take advantage of that by removing write -protection from the guest page, and allowing the guest to modify it freely. -We synchronize modified gptes when the guest invokes invlpg. This reduces -the amount of emulation we have to do when the guest modifies multiple gptes, -or when the a guest page is no longer used as a page table and is used for -random guest data. - -As a side effect we have to resynchronize all reachable unsynchronized shadow -pages on a tlb flush. - - -Reaction to events -================== - -- guest page fault (or npt page fault, or ept violation) - -This is the most complicated event. The cause of a page fault can be: - - - a true guest fault (the guest translation won't allow the access) (*) - - access to a missing translation - - access to a protected translation - - when logging dirty pages, memory is write protected - - synchronized shadow pages are write protected (*) - - access to untranslatable memory (mmio) - - (*) not applicable in direct mode - -Handling a page fault is performed as follows: - - - if the RSV bit of the error code is set, the page fault is caused by guest - accessing MMIO and cached MMIO information is available. - - walk shadow page table - - check for valid generation number in the spte (see "Fast invalidation of - MMIO sptes" below) - - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and - vcpu->arch.mmio_gfn, and call the emulator - - If both P bit and R/W bit of error code are set, this could possibly - be handled as a "fast page fault" (fixed without taking the MMU lock). See - the description in Documentation/virtual/kvm/locking.txt. - - if needed, walk the guest page tables to determine the guest translation - (gva->gpa or ngpa->gpa) - - if permissions are insufficient, reflect the fault back to the guest - - determine the host page - - if this is an mmio request, there is no host page; cache the info to - vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn - - walk the shadow page table to find the spte for the translation, - instantiating missing intermediate page tables as necessary - - If this is an mmio request, cache the mmio info to the spte and set some - reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask) - - try to unsynchronize the page - - if successful, we can let the guest continue and modify the gpte - - emulate the instruction - - if failed, unshadow the page and let the guest continue - - update any translations that were modified by the instruction - -invlpg handling: - - - walk the shadow page hierarchy and drop affected translations - - try to reinstantiate the indicated translation in the hope that the - guest will use it in the near future - -Guest control register updates: - -- mov to cr3 - - look up new shadow roots - - synchronize newly reachable shadow pages - -- mov to cr0/cr4/efer - - set up mmu context for new paging mode - - look up new shadow roots - - synchronize newly reachable shadow pages - -Host translation updates: - - - mmu notifier called with updated hva - - look up affected sptes through reverse map - - drop (or update) translations - -Emulating cr0.wp -================ - -If tdp is not enabled, the host must keep cr0.wp=1 so page write protection -works for the guest kernel, not guest guest userspace. When the guest -cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, -we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the -semantics require allowing any guest kernel access plus user read access). - -We handle this by mapping the permissions to two possible sptes, depending -on fault type: - -- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, - disallows user access) -- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel - write access) - -(user write faults generate a #PF) - -In the first case there are two additional complications: -- if CR4.SMEP is enabled: since we've turned the page into a kernel page, - the kernel may now execute it. We handle this by also setting spte.nx. - If we get a user fetch or read fault, we'll change spte.u=1 and - spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when - shadow paging is in use. -- if CR4.SMAP is disabled: since the page has been changed to a kernel - page, it can not be reused when CR4.SMAP is enabled. We set - CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note, - here we do not care the case that CR4.SMAP is enabled since KVM will - directly inject #PF to guest due to failed permission check. - -To prevent an spte that was converted into a kernel page with cr0.wp=0 -from being written by the kernel after cr0.wp has changed to 1, we make -the value of cr0.wp part of the page role. This means that an spte created -with one value of cr0.wp cannot be used when cr0.wp has a different value - -it will simply be missed by the shadow page lookup code. A similar issue -exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after -changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep -is also made a part of the page role. - -Large pages -=========== - -The mmu supports all combinations of large and small guest and host pages. -Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as -two separate 2M pages, on both guest and host, since the mmu always uses PAE -paging. - -To instantiate a large spte, four constraints must be satisfied: - -- the spte must point to a large host page -- the guest pte must be a large pte of at least equivalent size (if tdp is - enabled, there is no guest pte and this condition is satisfied) -- if the spte will be writeable, the large page frame may not overlap any - write-protected pages -- the guest page must be wholly contained by a single memory slot - -To check the last two conditions, the mmu maintains a ->disallow_lpage set of -arrays for each memory slot and large page size. Every write protected page -causes its disallow_lpage to be incremented, thus preventing instantiation of -a large spte. The frames at the end of an unaligned memory slot have -artificially inflated ->disallow_lpages so they can never be instantiated. - -Fast invalidation of MMIO sptes -=============================== - -As mentioned in "Reaction to events" above, kvm will cache MMIO -information in leaf sptes. When a new memslot is added or an existing -memslot is changed, this information may become stale and needs to be -invalidated. This also needs to hold the MMU lock while walking all -shadow pages, and is made more scalable with a similar technique. - -MMIO sptes have a few spare bits, which are used to store a -generation number. The global generation number is stored in -kvm_memslots(kvm)->generation, and increased whenever guest memory info -changes. - -When KVM finds an MMIO spte, it checks the generation number of the spte. -If the generation number of the spte does not equal the global generation -number, it will ignore the cached MMIO information and handle the page -fault through the slow path. - -Since only 19 bits are used to store generation-number on mmio spte, all -pages are zapped when there is an overflow. - -Unfortunately, a single memory access might access kvm_memslots(kvm) multiple -times, the last one happening when the generation number is retrieved and -stored into the MMIO spte. Thus, the MMIO spte might be created based on -out-of-date information, but with an up-to-date generation number. - -To avoid this, the generation number is incremented again after synchronize_srcu -returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a -memslot update, while some SRCU readers might be using the old copy. We do not -want to use an MMIO sptes created with an odd generation number, and we can do -this without losing a bit in the MMIO spte. The "update in-progress" bit of the -generation is not stored in MMIO spte, and is so is implicitly zero when the -generation is extracted out of the spte. If KVM is unlucky and creates an MMIO -spte while an update is in-progress, the next access to the spte will always be -a cache miss. For example, a subsequent access during the update window will -miss due to the in-progress flag diverging, while an access after the update -window closes will have a higher generation number (as compared to the spte). - - -Further reading -=============== - -- NPT presentation from KVM Forum 2008 - http://www.linux-kvm.org/images/c/c8/KvmForum2008%24kdf2008_21.pdf - diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt deleted file mode 100644 index df1f4338b3ca..000000000000 --- a/Documentation/virtual/kvm/msr.txt +++ /dev/null @@ -1,284 +0,0 @@ -KVM-specific MSRs. -Glauber Costa , Red Hat Inc, 2010 -===================================================== - -KVM makes use of some custom MSRs to service some requests. - -Custom MSRs have a range reserved for them, that goes from -0x4b564d00 to 0x4b564dff. There are MSRs outside this area, -but they are deprecated and their use is discouraged. - -Custom MSR list --------- - -The current supported Custom MSR list is: - -MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 - - data: 4-byte alignment physical address of a memory area which must be - in guest RAM. This memory is expected to hold a copy of the following - structure: - - struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; - } __attribute__((__packed__)); - - whose data will be filled in by the hypervisor. The hypervisor is only - guaranteed to update this data at the moment of MSR write. - Users that want to reliably query this information more than once have - to write more than once to this MSR. Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - sec: number of seconds for wallclock at time of boot. - - nsec: number of nanoseconds for wallclock at time of boot. - - In order to get the current wallclock time, the system_time from - MSR_KVM_SYSTEM_TIME_NEW needs to be added. - - Note that although MSRs are per-CPU entities, the effect of this - particular MSR is global. - - Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 - - data: 4-byte aligned physical address of a memory area which must be in - guest RAM, plus an enable bit in bit 0. This memory is expected to hold - a copy of the following structure: - - struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; - } __attribute__((__packed__)); /* 32 bytes */ - - whose data will be filled in by the hypervisor periodically. Only one - write, or registration, is needed for each VCPU. The interval between - updates of this structure is arbitrary and implementation-dependent. - The hypervisor may update this structure at any time it sees fit until - anything with bit0 == 0 is written to it. - - Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - tsc_timestamp: the tsc value at the current VCPU at the time - of the update of this structure. Guests can subtract this value - from current tsc to derive a notion of elapsed time since the - structure update. - - system_time: a host notion of monotonic time, including sleep - time at the time this structure was last updated. Unit is - nanoseconds. - - tsc_to_system_mul: multiplier to be used when converting - tsc-related quantity to nanoseconds - - tsc_shift: shift to be used when converting tsc-related - quantity to nanoseconds. This shift will ensure that - multiplication with tsc_to_system_mul does not overflow. - A positive value denotes a left shift, a negative value - a right shift. - - The conversion from tsc to nanoseconds involves an additional - right shift by 32 bits. With this information, guests can - derive per-CPU time by doing: - - time = (current_tsc - tsc_timestamp) - if (tsc_shift >= 0) - time <<= tsc_shift; - else - time >>= -tsc_shift; - time = (time * tsc_to_system_mul) >> 32 - time = time + system_time - - flags: bits in this field indicate extended capabilities - coordinated between the guest and the hypervisor. Availability - of specific flags has to be checked in 0x40000001 cpuid leaf. - Current flags are: - - flag bit | cpuid bit | meaning - ------------------------------------------------------------- - | | time measures taken across - 0 | 24 | multiple cpus are guaranteed to - | | be monotonic - ------------------------------------------------------------- - | | guest vcpu has been paused by - 1 | N/A | the host - | | See 4.70 in api.txt - ------------------------------------------------------------- - - Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid - leaf prior to usage. - - -MSR_KVM_WALL_CLOCK: 0x11 - - data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME: 0x12 - - data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid - leaf prior to usage. - - The suggested algorithm for detecting kvmclock presence is then: - - if (!kvm_para_available()) /* refer to cpuid.txt */ - return NON_PRESENT; - - flags = cpuid_eax(0x40000001); - if (flags & 3) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - return PRESENT; - } else if (flags & 0) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; - return PRESENT; - } else - return NON_PRESENT; - -MSR_KVM_ASYNC_PF_EN: 0x4b564d02 - data: Bits 63-6 hold 64-byte aligned physical address of a - 64 byte memory area which must be in guest RAM and must be - zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 - when asynchronous page faults are enabled on the vcpu 0 when - disabled. Bit 1 is 1 if asynchronous page faults can be injected - when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults - are delivered to L1 as #PF vmexits. Bit 2 can be set only if - KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. - - First 4 byte of 64 byte memory location will be written to by - the hypervisor at the time of asynchronous page fault (APF) - injection to indicate type of asynchronous page fault. Value - of 1 means that the page referred to by the page fault is not - present. Value 2 means that the page is now available. Disabling - interrupt inhibits APFs. Guest must not enable interrupt - before the reason is read, or it may be overwritten by another - APF. Since APF uses the same exception vector as regular page - fault guest must reset the reason to 0 before it does - something that can generate normal page fault. If during page - fault APF reason is 0 it means that this is regular page - fault. - - During delivery of type 1 APF cr2 contains a token that will - be used to notify a guest when missing page becomes - available. When page becomes available type 2 APF is sent with - cr2 set to the token associated with the page. There is special - kind of token 0xffffffff which tells vcpu that it should wake - up all processes waiting for APFs and no individual type 2 APFs - will be sent. - - If APF is disabled while there are outstanding APFs, they will - not be delivered. - - Currently type 2 APF will be always delivered on the same vcpu as - type 1 was, but guest should not rely on that. - -MSR_KVM_STEAL_TIME: 0x4b564d03 - - data: 64-byte alignment physical address of a memory area which must be - in guest RAM, plus an enable bit in bit 0. This memory is expected to - hold a copy of the following structure: - - struct kvm_steal_time { - __u64 steal; - __u32 version; - __u32 flags; - __u8 preempted; - __u8 u8_pad[3]; - __u32 pad[11]; - } - - whose data will be filled in by the hypervisor periodically. Only one - write, or registration, is needed for each VCPU. The interval between - updates of this structure is arbitrary and implementation-dependent. - The hypervisor may update this structure at any time it sees fit until - anything with bit0 == 0 is written to it. Guest is required to make sure - this structure is initialized to zero. - - Fields have the following meanings: - - version: a sequence counter. In other words, guest has to check - this field before and after grabbing time information and make - sure they are both equal and even. An odd version indicates an - in-progress update. - - flags: At this point, always zero. May be used to indicate - changes in this structure in the future. - - steal: the amount of time in which this vCPU did not run, in - nanoseconds. Time during which the vcpu is idle, will not be - reported as steal time. - - preempted: indicate the vCPU who owns this struct is running or - not. Non-zero values mean the vCPU has been preempted. Zero - means the vCPU is not preempted. NOTE, it is always zero if the - the hypervisor doesn't support this field. - -MSR_KVM_EOI_EN: 0x4b564d04 - data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 - when disabled. Bit 1 is reserved and must be zero. When PV end of - interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned - physical address of a 4 byte memory area which must be in guest RAM and - must be zeroed. - - The first, least significant bit of 4 byte memory location will be - written to by the hypervisor, typically at the time of interrupt - injection. Value of 1 means that guest can skip writing EOI to the apic - (using MSR or MMIO write); instead, it is sufficient to signal - EOI by clearing the bit in guest memory - this location will - later be polled by the hypervisor. - Value of 0 means that the EOI write is required. - - It is always safe for the guest to ignore the optimization and perform - the APIC EOI write anyway. - - Hypervisor is guaranteed to only modify this least - significant bit while in the current VCPU context, this means that - guest does not need to use either lock prefix or memory ordering - primitives to synchronise with the hypervisor. - - However, hypervisor can set and clear this memory bit at any time: - therefore to make sure hypervisor does not interrupt the - guest and clear the least significant bit in the memory area - in the window between guest testing it to detect - whether it can skip EOI apic write and between guest - clearing it to signal EOI to the hypervisor, - guest must both read the least significant bit in the memory area and - clear it using a single CPU instruction, such as test and clear, or - compare and exchange. - -MSR_KVM_POLL_CONTROL: 0x4b564d05 - Control host-side polling. - - data: Bit 0 enables (1) or disables (0) host-side HLT polling logic. - - KVM guests can request the host not to poll on HLT, for example if - they are performing polling themselves. - diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virtual/kvm/nested-vmx.txt deleted file mode 100644 index 97eb1353e962..000000000000 --- a/Documentation/virtual/kvm/nested-vmx.txt +++ /dev/null @@ -1,240 +0,0 @@ -Nested VMX -========== - -Overview ---------- - -On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions) -to easily and efficiently run guest operating systems. Normally, these guests -*cannot* themselves be hypervisors running their own guests, because in VMX, -guests cannot use VMX instructions. - -The "Nested VMX" feature adds this missing capability - of running guest -hypervisors (which use VMX) with their own nested guests. It does so by -allowing a guest to use VMX instructions, and correctly and efficiently -emulating them using the single level of VMX available in the hardware. - -We describe in much greater detail the theory behind the nested VMX feature, -its implementation and its performance characteristics, in the OSDI 2010 paper -"The Turtles Project: Design and Implementation of Nested Virtualization", -available at: - - http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf - - -Terminology ------------ - -Single-level virtualization has two levels - the host (KVM) and the guests. -In nested virtualization, we have three levels: The host (KVM), which we call -L0, the guest hypervisor, which we call L1, and its nested guest, which we -call L2. - - -Running nested VMX ------------------- - -The nested VMX feature is disabled by default. It can be enabled by giving -the "nested=1" option to the kvm-intel module. - -No modifications are required to user space (qemu). However, qemu's default -emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be -explicitly enabled, by giving qemu one of the following options: - - -cpu host (emulated CPU has all features of the real CPU) - - -cpu qemu64,+vmx (add just the vmx feature to a named CPU type) - - -ABIs ----- - -Nested VMX aims to present a standard and (eventually) fully-functional VMX -implementation for the a guest hypervisor to use. As such, the official -specification of the ABI that it provides is Intel's VMX specification, -namely volume 3B of their "Intel 64 and IA-32 Architectures Software -Developer's Manual". Not all of VMX's features are currently fully supported, -but the goal is to eventually support them all, starting with the VMX features -which are used in practice by popular hypervisors (KVM and others). - -As a VMX implementation, nested VMX presents a VMCS structure to L1. -As mandated by the spec, other than the two fields revision_id and abort, -this structure is *opaque* to its user, who is not supposed to know or care -about its internal structure. Rather, the structure is accessed through the -VMREAD and VMWRITE instructions. -Still, for debugging purposes, KVM developers might be interested to know the -internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c. - -The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we -also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS -which L0 builds to actually run L2 - how this is done is explained in the -aforementioned paper. - -For convenience, we repeat the content of struct vmcs12 here. If the internals -of this structure changes, this can break live migration across KVM versions. -VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner -struct shadow_vmcs is ever changed. - - typedef u64 natural_width; - struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with - * these two user-visible fields */ - u32 revision_id; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 ept_pointer; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 padding64[8]; /* room for future expansion */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 padding32[8]; /* room for future expansion */ - u16 virtual_processor_id; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - }; - - -Authors -------- - -These patches were written by: - Abel Gordon, abelg il.ibm.com - Nadav Har'El, nyh il.ibm.com - Orit Wasserman, oritw il.ibm.com - Ben-Ami Yassor, benami il.ibm.com - Muli Ben-Yehuda, muli il.ibm.com - -With contributions by: - Anthony Liguori, aliguori us.ibm.com - Mike Day, mdday us.ibm.com - Michael Factor, factor il.ibm.com - Zvi Dubitzky, dubi il.ibm.com - -And valuable reviews by: - Avi Kivity, avi redhat.com - Gleb Natapov, gleb redhat.com - Marcelo Tosatti, mtosatti redhat.com - Kevin Tian, kevin.tian intel.com - and others. diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt deleted file mode 100644 index e26115ce4258..000000000000 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ /dev/null @@ -1,212 +0,0 @@ -The PPC KVM paravirtual interface -================================= - -The basic execution principle by which KVM on PowerPC works is to run all kernel -space code in PR=1 which is user space. This way we trap all privileged -instructions and can emulate them accordingly. - -Unfortunately that is also the downfall. There are quite some privileged -instructions that needlessly return us to the hypervisor even though they -could be handled differently. - -This is what the PPC PV interface helps with. It takes privileged instructions -and transforms them into unprivileged ones with some help from the hypervisor. -This cuts down virtualization costs by about 50% on some of my benchmarks. - -The code for that interface can be found in arch/powerpc/kernel/kvm* - -Querying for existence -====================== - -To find out if we're running on KVM or not, we leverage the device tree. When -Linux is running on KVM, a node /hypervisor exists. That node contains a -compatible property with the value "linux,kvm". - -Once you determined you're running under a PV capable KVM, you can now use -hypercalls as described below. - -KVM hypercalls -============== - -Inside the device tree's /hypervisor node there's a property called -'hypercall-instructions'. This property contains at most 4 opcodes that make -up the hypercall. To call a hypercall, just call these instructions. - -The parameters are as follows: - - Register IN OUT - - r0 - volatile - r3 1st parameter Return code - r4 2nd parameter 1st output value - r5 3rd parameter 2nd output value - r6 4th parameter 3rd output value - r7 5th parameter 4th output value - r8 6th parameter 5th output value - r9 7th parameter 6th output value - r10 8th parameter 7th output value - r11 hypercall number 8th output value - r12 - volatile - -Hypercall definitions are shared in generic code, so the same hypercall numbers -apply for x86 and powerpc alike with the exception that each KVM hypercall -also needs to be ORed with the KVM vendor code which is (42 << 16). - -Return codes can be as follows: - - Code Meaning - - 0 Success - 12 Hypercall not implemented - <0 Error - -The magic page -============== - -To enable communication between the hypervisor and guest there is a new shared -page that contains parts of supervisor visible register state. The guest can -map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. - -With this hypercall issued the guest always gets the magic page mapped at the -desired location. The first parameter indicates the effective address when the -MMU is enabled. The second parameter indicates the address in real mode, if -applicable to the target. For now, we always map the page to -4096. This way we -can access it using absolute load and store functions. The following -instruction reads the first field of the magic page: - - ld rX, -4096(0) - -The interface is designed to be extensible should there be need later to add -additional registers to the magic page. If you add fields to the magic page, -also define a new hypercall feature to indicate that the host can give you more -registers. Only if the host supports the additional features, make use of them. - -The magic page layout is described by struct kvm_vcpu_arch_shared -in arch/powerpc/include/asm/kvm_para.h. - -Magic page features -=================== - -When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, -a second return value is passed to the guest. This second return value contains -a bitmap of available features inside the magic page. - -The following enhancements to the magic page are currently available: - - KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page - KVM_MAGIC_FEAT_MAS0_TO_SPRG7 Maps MASn, ESR, PIR and high SPRGs - -For enhanced features in the magic page, please check for the existence of the -feature before using them! - -Magic page flags -================ - -In addition to features that indicate whether a host is capable of a particular -feature we also have a channel for a guest to tell the guest whether it's capable -of something. This is what we call "flags". - -Flags are passed to the host in the low 12 bits of the Effective Address. - -The following flags are currently available for a guest to expose: - - MAGIC_PAGE_FLAG_NOT_MAPPED_NX Guest handles NX bits correctly wrt magic page - -MSR bits -======== - -The MSR contains bits that require hypervisor intervention and bits that do -not require direct hypervisor intervention because they only get interpreted -when entering the guest or don't have any impact on the hypervisor's behavior. - -The following bits are safe to be set inside the guest: - - MSR_EE - MSR_RI - -If any other bit changes in the MSR, please still use mtmsr(d). - -Patched instructions -==================== - -The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions -respectively on 32 bit systems with an added offset of 4 to accommodate for big -endianness. - -The following is a list of mapping the Linux kernel performs when running as -guest. Implementing any of those mappings is optional, as the instruction traps -also act on the shared page. So calling privileged instructions still works as -before. - -From To -==== == - -mfmsr rX ld rX, magic_page->msr -mfsprg rX, 0 ld rX, magic_page->sprg0 -mfsprg rX, 1 ld rX, magic_page->sprg1 -mfsprg rX, 2 ld rX, magic_page->sprg2 -mfsprg rX, 3 ld rX, magic_page->sprg3 -mfsrr0 rX ld rX, magic_page->srr0 -mfsrr1 rX ld rX, magic_page->srr1 -mfdar rX ld rX, magic_page->dar -mfdsisr rX lwz rX, magic_page->dsisr - -mtmsr rX std rX, magic_page->msr -mtsprg 0, rX std rX, magic_page->sprg0 -mtsprg 1, rX std rX, magic_page->sprg1 -mtsprg 2, rX std rX, magic_page->sprg2 -mtsprg 3, rX std rX, magic_page->sprg3 -mtsrr0 rX std rX, magic_page->srr0 -mtsrr1 rX std rX, magic_page->srr1 -mtdar rX std rX, magic_page->dar -mtdsisr rX stw rX, magic_page->dsisr - -tlbsync nop - -mtmsrd rX, 0 b -mtmsr rX b - -mtmsrd rX, 1 b - -[Book3S only] -mtsrin rX, rY b - -[BookE only] -wrteei [0|1] b - - -Some instructions require more logic to determine what's going on than a load -or store instruction can deliver. To enable patching of those, we keep some -RAM around where we can live translate instructions to. What happens is the -following: - - 1) copy emulation code to memory - 2) patch that code to fit the emulated instruction - 3) patch that code to return to the original pc + 4 - 4) patch the original instruction to branch to the new code - -That way we can inject an arbitrary amount of code as replacement for a single -instruction. This allows us to check for pending interrupts when setting EE=1 -for example. - -Hypercall ABIs in KVM on PowerPC -================================= -1) KVM hypercalls (ePAPR) - -These are ePAPR compliant hypercall implementation (mentioned above). Even -generic hypercalls are implemented here, like the ePAPR idle hcall. These are -available on all targets. - -2) PAPR hypercalls - -PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU). -These are the same hypercalls that pHyp, the POWER hypervisor implements. Some of -them are handled in the kernel, some are handled in user space. This is only -available on book3s_64. - -3) OSI hypercalls - -Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long -before KVM). This is supported to maintain compatibility. All these hypercalls get -forwarded to user space. This is only useful on book3s_32, but can be used with -book3s_64 as well. diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virtual/kvm/review-checklist.txt deleted file mode 100644 index a83b27635fdd..000000000000 --- a/Documentation/virtual/kvm/review-checklist.txt +++ /dev/null @@ -1,38 +0,0 @@ -Review checklist for kvm patches -================================ - -1. The patch must follow Documentation/process/coding-style.rst and - Documentation/process/submitting-patches.rst. - -2. Patches should be against kvm.git master branch. - -3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/virtual/kvm/api.txt - - the API must be discoverable using KVM_CHECK_EXTENSION - -4. New state must include support for save/restore. - -5. New features must default to off (userspace should explicitly request them). - Performance improvements can and should default to on. - -6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 - -7. Emulator changes should be accompanied by unit tests for qemu-kvm.git - kvm/test directory. - -8. Changes should be vendor neutral when possible. Changes to common code - are better than duplicating changes to vendor code. - -9. Similarly, prefer changes to arch independent code than to arch dependent - code. - -10. User/kernel interfaces and guest/host interfaces must be 64-bit clean - (all variables and sizes naturally aligned on 64-bit; use specific types - only - u64 rather than ulong). - -11. New guest visible features must either be documented in a hardware manual - or be accompanied by documentation. - -12. Features must be robust against reset and kexec - for example, shared - host/guest memory must be unshared to prevent the host from writing to - guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/virtual/kvm/s390-diag.txt b/Documentation/virtual/kvm/s390-diag.txt deleted file mode 100644 index 7c52e5f8b210..000000000000 --- a/Documentation/virtual/kvm/s390-diag.txt +++ /dev/null @@ -1,83 +0,0 @@ -The s390 DIAGNOSE call on KVM -============================= - -KVM on s390 supports the DIAGNOSE call for making hypercalls, both for -native hypercalls and for selected hypercalls found on other s390 -hypervisors. - -Note that bits are numbered as by the usual s390 convention (most significant -bit on the left). - - -General remarks ---------------- - -DIAGNOSE calls by the guest cause a mandatory intercept. This implies -all supported DIAGNOSE calls need to be handled by either KVM or its -userspace. - -All DIAGNOSE calls supported by KVM use the RS-a format: - --------------------------------------- -| '83' | R1 | R3 | B2 | D2 | --------------------------------------- -0 8 12 16 20 31 - -The second-operand address (obtained by the base/displacement calculation) -is not used to address data. Instead, bits 48-63 of this address specify -the function code, and bits 0-47 are ignored. - -The supported DIAGNOSE function codes vary by the userspace used. For -DIAGNOSE function codes not specific to KVM, please refer to the -documentation for the s390 hypervisors defining them. - - -DIAGNOSE function code 'X'500' - KVM virtio functions ------------------------------------------------------ - -If the function code specifies 0x500, various virtio-related functions -are performed. - -General register 1 contains the virtio subfunction code. Supported -virtio subfunctions depend on KVM's userspace. Generally, userspace -provides either s390-virtio (subcodes 0-2) or virtio-ccw (subcode 3). - -Upon completion of the DIAGNOSE instruction, general register 2 contains -the function's return code, which is either a return code or a subcode -specific value. - -Subcode 0 - s390-virtio notification and early console printk - Handled by userspace. - -Subcode 1 - s390-virtio reset - Handled by userspace. - -Subcode 2 - s390-virtio set status - Handled by userspace. - -Subcode 3 - virtio-ccw notification - Handled by either userspace or KVM (ioeventfd case). - - General register 2 contains a subchannel-identification word denoting - the subchannel of the virtio-ccw proxy device to be notified. - - General register 3 contains the number of the virtqueue to be notified. - - General register 4 contains a 64bit identifier for KVM usage (the - kvm_io_bus cookie). If general register 4 does not contain a valid - identifier, it is ignored. - - After completion of the DIAGNOSE call, general register 2 may contain - a 64bit identifier (in the kvm_io_bus cookie case), or a negative - error value, if an internal error occurred. - - See also the virtio standard for a discussion of this hypercall. - - -DIAGNOSE function code 'X'501 - KVM breakpoint ----------------------------------------------- - -If the function code specifies 0x501, breakpoint functions may be performed. -This function code is handled by userspace. - -This diagnose function code has no subfunctions and uses no parameters. diff --git a/Documentation/virtual/kvm/timekeeping.txt b/Documentation/virtual/kvm/timekeeping.txt deleted file mode 100644 index 76808a17ad84..000000000000 --- a/Documentation/virtual/kvm/timekeeping.txt +++ /dev/null @@ -1,612 +0,0 @@ - - Timekeeping Virtualization for X86-Based Architectures - - Zachary Amsden - Copyright (c) 2010, Red Hat. All rights reserved. - -1) Overview -2) Timing Devices -3) TSC Hardware -4) Virtualization Problems - -========================================================================= - -1) Overview - -One of the most complicated parts of the X86 platform, and specifically, -the virtualization of this platform is the plethora of timing devices available -and the complexity of emulating those devices. In addition, virtualization of -time introduces a new set of challenges because it introduces a multiplexed -division of time beyond the control of the guest CPU. - -First, we will describe the various timekeeping hardware available, then -present some of the problems which arise and solutions available, giving -specific recommendations for certain classes of KVM guests. - -The purpose of this document is to collect data and information relevant to -timekeeping which may be difficult to find elsewhere, specifically, -information relevant to KVM and hardware-based virtualization. - -========================================================================= - -2) Timing Devices - -First we discuss the basic hardware devices available. TSC and the related -KVM clock are special enough to warrant a full exposition and are described in -the following section. - -2.1) i8254 - PIT - -One of the first timer devices available is the programmable interrupt timer, -or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three -channels which can be programmed to deliver periodic or one-shot interrupts. -These three channels can be configured in different modes and have individual -counters. Channel 1 and 2 were not available for general use in the original -IBM PC, and historically were connected to control RAM refresh and the PC -speaker. Now the PIT is typically integrated as part of an emulated chipset -and a separate physical PIT is not used. - -The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done -using single or multiple byte access to the I/O ports. There are 6 modes -available, but not all modes are available to all timers, as only timer 2 -has a connected gate input, required for modes 1 and 5. The gate line is -controlled by port 61h, bit 0, as illustrated in the following diagram. - - -------------- ---------------- -| | | | -| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 -| Clock | | | | - -------------- | +->| GATE TIMER 0 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM - | | | (aka /dev/null) - | +->| GATE TIMER 1 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> Port 61h, bit 5 - | | | -Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ - ---------------- _| )--|LPF|---Speaker - / *---- \___/ -Port 61h, bit 1 -----------------------------------/ - -The timer modes are now described. - -Mode 0: Single Timeout. This is a one-shot software timeout that counts down - when the gate is high (always true for timers 0 and 1). When the count - reaches zero, the output goes high. - -Mode 1: Triggered One-shot. The output is initially set high. When the gate - line is set high, a countdown is initiated (which does not stop if the gate is - lowered), during which the output is set low. When the count reaches zero, - the output goes high. - -Mode 2: Rate Generator. The output is initially set high. When the countdown - reaches 1, the output goes low for one count and then returns high. The value - is reloaded and the countdown automatically resumes. If the gate line goes - low, the count is halted. If the output is low when the gate is lowered, the - output automatically goes high (this only affects timer 2). - -Mode 3: Square Wave. This generates a high / low square wave. The count - determines the length of the pulse, which alternates between high and low - when zero is reached. The count only proceeds when gate is high and is - automatically reloaded on reaching zero. The count is decremented twice at - each clock to generate a full high / low cycle at the full periodic rate. - If the count is even, the clock remains high for N/2 counts and low for N/2 - counts; if the clock is odd, the clock is high for (N+1)/2 counts and low - for (N-1)/2 counts. Only even values are latched by the counter, so odd - values are not observed when reading. This is the intended mode for timer 2, - which generates sine-like tones by low-pass filtering the square wave output. - -Mode 4: Software Strobe. After programming this mode and loading the counter, - the output remains high until the counter reaches zero. Then the output - goes low for 1 clock cycle and returns high. The counter is not reloaded. - Counting only occurs when gate is high. - -Mode 5: Hardware Strobe. After programming and loading the counter, the - output remains high. When the gate is raised, a countdown is initiated - (which does not stop if the gate is lowered). When the counter reaches zero, - the output goes low for 1 clock cycle and then returns high. The counter is - not reloaded. - -In addition to normal binary counting, the PIT supports BCD counting. The -command port, 0x43 is used to set the counter and mode for each of the three -timers. - -PIT commands, issued to port 0x43, using the following bit encoding: - -Bit 7-4: Command (See table below) -Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) -Bit 0 : Binary (0) / BCD (1) - -Command table: - -0000 - Latch Timer 0 count for port 0x40 - sample and hold the count to be read in port 0x40; - additional commands ignored until counter is read; - mode bits ignored. - -0001 - Set Timer 0 LSB mode for port 0x40 - set timer to read LSB only and force MSB to zero; - mode bits set timer mode - -0010 - Set Timer 0 MSB mode for port 0x40 - set timer to read MSB only and force LSB to zero; - mode bits set timer mode - -0011 - Set Timer 0 16-bit mode for port 0x40 - set timer to read / write LSB first, then MSB; - mode bits set timer mode - -0100 - Latch Timer 1 count for port 0x41 - as described above -0101 - Set Timer 1 LSB mode for port 0x41 - as described above -0110 - Set Timer 1 MSB mode for port 0x41 - as described above -0111 - Set Timer 1 16-bit mode for port 0x41 - as described above - -1000 - Latch Timer 2 count for port 0x42 - as described above -1001 - Set Timer 2 LSB mode for port 0x42 - as described above -1010 - Set Timer 2 MSB mode for port 0x42 - as described above -1011 - Set Timer 2 16-bit mode for port 0x42 as described above - -1101 - General counter latch - Latch combination of counters into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - Bit 0 = Unused - -1110 - Latch timer status - Latch combination of counter mode into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - - The output of ports 0x40-0x42 following this command will be: - - Bit 7 = Output pin - Bit 6 = Count loaded (0 if timer has expired) - Bit 5-4 = Read / Write mode - 01 = MSB only - 10 = LSB only - 11 = LSB / MSB (16-bit) - Bit 3-1 = Mode - Bit 0 = Binary (0) / BCD mode (1) - -2.2) RTC - -The second device which was available in the original PC was the MC146818 real -time clock. The original device is now obsolete, and usually emulated by the -system chipset, sometimes by an HPET and some frankenstein IRQ routing. - -The RTC is accessed through CMOS variables, which uses an index register to -control which bytes are read. Since there is only one index register, read -of the CMOS and read of the RTC require lock protection (in addition, it is -dangerous to allow userspace utilities such as hwclock to have direct RTC -access, as they could corrupt kernel reads and writes of CMOS memory). - -The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt -can function as a periodic timer, an additional once a day alarm, and can issue -interrupts after an update of the CMOS registers by the MC146818 is complete. -The type of interrupt is signalled in the RTC status registers. - -The RTC will update the current time fields by battery power even while the -system is off. The current time fields should not be read while an update is -in progress, as indicated in the status register. - -The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be -programmed to a 32kHz divider if the RTC is to count seconds. - -This is the RAM map originally used for the RTC/CMOS: - -Location Size Description ------------------------------------------- -00h byte Current second (BCD) -01h byte Seconds alarm (BCD) -02h byte Current minute (BCD) -03h byte Minutes alarm (BCD) -04h byte Current hour (BCD) -05h byte Hours alarm (BCD) -06h byte Current day of week (BCD) -07h byte Current day of month (BCD) -08h byte Current month (BCD) -09h byte Current year (BCD) -0Ah byte Register A - bit 7 = Update in progress - bit 6-4 = Divider for clock - 000 = 4.194 MHz - 001 = 1.049 MHz - 010 = 32 kHz - 10X = test modes - 110 = reset / disable - 111 = reset / disable - bit 3-0 = Rate selection for periodic interrupt - 000 = periodic timer disabled - 001 = 3.90625 uS - 010 = 7.8125 uS - 011 = .122070 mS - 100 = .244141 mS - ... - 1101 = 125 mS - 1110 = 250 mS - 1111 = 500 mS -0Bh byte Register B - bit 7 = Run (0) / Halt (1) - bit 6 = Periodic interrupt enable - bit 5 = Alarm interrupt enable - bit 4 = Update-ended interrupt enable - bit 3 = Square wave interrupt enable - bit 2 = BCD calendar (0) / Binary (1) - bit 1 = 12-hour mode (0) / 24-hour mode (1) - bit 0 = 0 (DST off) / 1 (DST enabled) -OCh byte Register C (read only) - bit 7 = interrupt request flag (IRQF) - bit 6 = periodic interrupt flag (PF) - bit 5 = alarm interrupt flag (AF) - bit 4 = update interrupt flag (UF) - bit 3-0 = reserved -ODh byte Register D (read only) - bit 7 = RTC has power - bit 6-0 = reserved -32h byte Current century BCD (*) - (*) location vendor specific and now determined from ACPI global tables - -2.3) APIC - -On Pentium and later processors, an on-board timer is available to each CPU -as part of the Advanced Programmable Interrupt Controller. The APIC is -accessed through memory-mapped registers and provides interrupt service to each -CPU, used for IPIs and local timer interrupts. - -Although in theory the APIC is a safe and stable source for local interrupts, -in practice, many bugs and glitches have occurred due to the special nature of -the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect -the use of the APIC and that workarounds may be required. In addition, some of -these workarounds pose unique constraints for virtualization - requiring either -extra overhead incurred from extra reads of memory-mapped I/O or additional -functionality that may be more computationally expensive to implement. - -Since the APIC is documented quite well in the Intel and AMD manuals, we will -avoid repetition of the detail here. It should be pointed out that the APIC -timer is programmed through the LVT (local vector timer) register, is capable -of one-shot or periodic operation, and is based on the bus clock divided down -by the programmable divider register. - -2.4) HPET - -HPET is quite complex, and was originally intended to replace the PIT / RTC -support of the X86 PC. It remains to be seen whether that will be the case, as -the de facto standard of PC hardware is to emulate these older devices. Some -systems designated as legacy free may support only the HPET as a hardware timer -device. - -The HPET spec is rather loose and vague, requiring at least 3 hardware timers, -but allowing implementation freedom to support many more. It also imposes no -fixed rate on the timer frequency, but does impose some extremal values on -frequency, error and slew. - -In general, the HPET is recommended as a high precision (compared to PIT /RTC) -time source which is independent of local variation (as there is only one HPET -in any given system). The HPET is also memory-mapped, and its presence is -indicated through ACPI tables by the BIOS. - -Detailed specification of the HPET is beyond the current scope of this -document, as it is also very well documented elsewhere. - -2.5) Offboard Timers - -Several cards, both proprietary (watchdog boards) and commonplace (e1000) have -timing chips built into the cards which may have registers which are accessible -to kernel or user drivers. To the author's knowledge, using these to generate -a clocksource for a Linux or other kernel has not yet been attempted and is in -general frowned upon as not playing by the agreed rules of the game. Such a -timer device would require additional support to be virtualized properly and is -not considered important at this time as no known operating system does this. - -========================================================================= - -3) TSC Hardware - -The TSC or time stamp counter is relatively simple in theory; it counts -instruction cycles issued by the processor, which can be used as a measure of -time. In practice, due to a number of problems, it is the most complicated -timekeeping device to use. - -The TSC is represented internally as a 64-bit MSR which can be read with the -RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware -limitations made it possible to write the TSC, but generally on old hardware it -was only possible to write the low 32-bits of the 64-bit counter, and the upper -32-bits of the counter were cleared. Now, however, on Intel processors family -0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction -has been lifted and all 64-bits are writable. On AMD systems, the ability to -write the TSC MSR is not an architectural guarantee. - -The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by -means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. - -Some vendors have implemented an additional instruction, RDTSCP, which returns -atomically not just the TSC, but an indicator which corresponds to the -processor number. This can be used to index into an array of TSC variables to -determine offset information in SMP systems where TSCs are not synchronized. -The presence of this instruction must be determined by consulting CPUID feature -bits. - -Both VMX and SVM provide extension fields in the virtualization hardware which -allows the guest visible TSC to be offset by a constant. Newer implementations -promise to allow the TSC to additionally be scaled, but this hardware is not -yet widely available. - -3.1) TSC synchronization - -The TSC is a CPU-local clock in most implementations. This means, on SMP -platforms, the TSCs of different CPUs may start at different times depending -on when the CPUs are powered on. Generally, CPUs on the same die will share -the same clock, however, this is not always the case. - -The BIOS may attempt to resynchronize the TSCs during the poweron process and -the operating system or other system software may attempt to do this as well. -Several hardware limitations make the problem worse - if it is not possible to -write the full 64-bits of the TSC, it may be impossible to match the TSC in -newly arriving CPUs to that of the rest of the system, resulting in -unsynchronized TSCs. This may be done by BIOS or system software, but in -practice, getting a perfectly synchronized TSC will not be possible unless all -values are read from the same clock, which generally only is possible on single -socket systems or those with special hardware support. - -3.2) TSC and CPU hotplug - -As touched on already, CPUs which arrive later than the boot time of the system -may not have a TSC value that is synchronized with the rest of the system. -Either system software, BIOS, or SMM code may actually try to establish the TSC -to a value matching the rest of the system, but a perfect match is usually not -a guarantee. This can have the effect of bringing a system from a state where -TSC is synchronized back to a state where TSC synchronization flaws, however -small, may be exposed to the OS and any virtualization environment. - -3.3) TSC and multi-socket / NUMA - -Multi-socket systems, especially large multi-socket systems are likely to have -individual clocksources rather than a single, universally distributed clock. -Since these clocks are driven by different crystals, they will not have -perfectly matched frequency, and temperature and electrical variations will -cause the CPU clocks, and thus the TSCs to drift over time. Depending on the -exact clock and bus design, the drift may or may not be fixed in absolute -error, and may accumulate over time. - -In addition, very large systems may deliberately slew the clocks of individual -cores. This technique, known as spread-spectrum clocking, reduces EMI at the -clock frequency and harmonics of it, which may be required to pass FCC -standards for telecommunications and computer equipment. - -It is recommended not to trust the TSCs to remain synchronized on NUMA or -multiple socket systems for these reasons. - -3.4) TSC and C-states - -C-states, or idling states of the processor, especially C1E and deeper sleep -states may be problematic for TSC as well. The TSC may stop advancing in such -a state, resulting in a TSC which is behind that of other CPUs when execution -is resumed. Such CPUs must be detected and flagged by the operating system -based on CPU and chipset identifications. - -The TSC in such a case may be corrected by catching it up to a known external -clocksource. - -3.5) TSC frequency change / P-states - -To make things slightly more interesting, some CPUs may change frequency. They -may or may not run the TSC at the same rate, and because the frequency change -may be staggered or slewed, at some points in time, the TSC rate may not be -known other than falling within a range of values. In this case, the TSC will -not be a stable time source, and must be calibrated against a known, stable, -external clock to be a usable source of time. - -Whether the TSC runs at a constant rate or scales with the P-state is model -dependent and must be determined by inspecting CPUID, chipset or vendor -specific MSR fields. - -In addition, some vendors have known bugs where the P-state is actually -compensated for properly during normal operation, but when the processor is -inactive, the P-state may be raised temporarily to service cache misses from -other processors. In such cases, the TSC on halted CPUs could advance faster -than that of non-halted processors. AMD Turion processors are known to have -this problem. - -3.6) TSC and STPCLK / T-states - -External signals given to the processor may also have the effect of stopping -the TSC. This is typically done for thermal emergency power control to prevent -an overheating condition, and typically, there is no way to detect that this -condition has happened. - -3.7) TSC virtualization - VMX - -VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET -field specified in the VMCS. Special instructions must be used to read and -write the VMCS field. - -3.8) TSC virtualization - SVM - -SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, SVM allows passing through the host TSC plus an additional offset -field specified in the SVM control block. - -3.9) TSC feature bits in Linux - -In summary, there is no way to guarantee the TSC remains in perfect -synchronization unless it is explicitly guaranteed by the architecture. Even -if so, the TSCs in multi-sockets or NUMA systems may still run independently -despite being locally consistent. - -The following feature bits are used by Linux to signal various TSC attributes, -but they can only be taken to be meaningful for UP or single node systems. - -X86_FEATURE_TSC : The TSC is available in hardware -X86_FEATURE_RDTSCP : The RDTSCP instruction is available -X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states -X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states -X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) - -4) Virtualization Problems - -Timekeeping is especially problematic for virtualization because a number of -challenges arise. The most obvious problem is that time is now shared between -the host and, potentially, a number of virtual machines. Thus the virtual -operating system does not run with 100% usage of the CPU, despite the fact that -it may very well make that assumption. It may expect it to remain true to very -exacting bounds when interrupt sources are disabled, but in reality only its -virtual interrupt sources are disabled, and the machine may still be preempted -at any time. This causes problems as the passage of real time, the injection -of machine interrupts and the associated clock sources are no longer completely -synchronized with real time. - -This same problem can occur on native hardware to a degree, as SMM mode may -steal cycles from the naturally on X86 systems when SMM mode is used by the -BIOS, but not in such an extreme fashion. However, the fact that SMM mode may -cause similar problems to virtualization makes it a good justification for -solving many of these problems on bare metal. - -4.1) Interrupt clocking - -One of the most immediate problems that occurs with legacy operating systems -is that the system timekeeping routines are often designed to keep track of -time by counting periodic interrupts. These interrupts may come from the PIT -or the RTC, but the problem is the same: the host virtualization engine may not -be able to deliver the proper number of interrupts per second, and so guest -time may fall behind. This is especially problematic if a high interrupt rate -is selected, such as 1000 HZ, which is unfortunately the default for many Linux -guests. - -There are three approaches to solving this problem; first, it may be possible -to simply ignore it. Guests which have a separate time source for tracking -'wall clock' or 'real time' may not need any adjustment of their interrupts to -maintain proper time. If this is not sufficient, it may be necessary to inject -additional interrupts into the guest in order to increase the effective -interrupt rate. This approach leads to complications in extreme conditions, -where host load or guest lag is too much to compensate for, and thus another -solution to the problem has risen: the guest may need to become aware of lost -ticks and compensate for them internally. Although promising in theory, the -implementation of this policy in Linux has been extremely error prone, and a -number of buggy variants of lost tick compensation are distributed across -commonly used Linux systems. - -Windows uses periodic RTC clocking as a means of keeping time internally, and -thus requires interrupt slewing to keep proper time. It does use a low enough -rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in -practice. - -4.2) TSC sampling and serialization - -As the highest precision time source available, the cycle counter of the CPU -has aroused much interest from developers. As explained above, this timer has -many problems unique to its nature as a local, potentially unstable and -potentially unsynchronized source. One issue which is not unique to the TSC, -but is highlighted because of its very precise nature is sampling delay. By -definition, the counter, once read is already old. However, it is also -possible for the counter to be read ahead of the actual use of the result. -This is a consequence of the superscalar execution of the instruction stream, -which may execute instructions out of order. Such execution is called -non-serialized. Forcing serialized execution is necessary for precise -measurement with the TSC, and requires a serializing instruction, such as CPUID -or an MSR read. - -Since CPUID may actually be virtualized by a trap and emulate mechanism, this -serialization can pose a performance issue for hardware virtualization. An -accurate time stamp counter reading may therefore not always be available, and -it may be necessary for an implementation to guard against "backwards" reads of -the TSC as seen from other CPUs, even in an otherwise perfectly synchronized -system. - -4.3) Timespec aliasing - -Additionally, this lack of serialization from the TSC poses another challenge -when using results of the TSC when measured against another time source. As -the TSC is much higher precision, many possible values of the TSC may be read -while another clock is still expressing the same value. - -That is, you may read (T,T+10) while external clock C maintains the same value. -Due to non-serialized reads, you may actually end up with a range which -fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but -calibrated against an external value may have a range of valid values. -Re-calibrating this computation may actually cause time, as computed after the -calibration, to go backwards, compared with time computed before the -calibration. - -This problem is particularly pronounced with an internal time source in Linux, -the kernel time, which is expressed in the theoretically high resolution -timespec - but which advances in much larger granularity intervals, sometimes -at the rate of jiffies, and possibly in catchup modes, at a much larger step. - -This aliasing requires care in the computation and recalibration of kvmclock -and any other values derived from TSC computation (such as TSC virtualization -itself). - -4.4) Migration - -Migration of a virtual machine raises problems for timekeeping in two ways. -First, the migration itself may take time, during which interrupts cannot be -delivered, and after which, the guest time may need to be caught up. NTP may -be able to help to some degree here, as the clock correction required is -typically small enough to fall in the NTP-correctable window. - -An additional concern is that timers based off the TSC (or HPET, if the raw bus -clock is exposed) may now be running at different rates, requiring compensation -in some way in the hypervisor by virtualizing these timers. In addition, -migrating to a faster machine may preclude the use of a passthrough TSC, as a -faster clock cannot be made visible to a guest without the potential of time -advancing faster than usual. A slower clock is less of a problem, as it can -always be caught up to the original rate. KVM clock avoids these problems by -simply storing multipliers and offsets against the TSC for the guest to convert -back into nanosecond resolution values. - -4.5) Scheduling - -Since scheduling may be based on precise timing and firing of interrupts, the -scheduling algorithms of an operating system may be adversely affected by -virtualization. In theory, the effect is random and should be universally -distributed, but in contrived as well as real scenarios (guest device access, -causes of virtualization exits, possible context switch), this may not always -be the case. The effect of this has not been well studied. - -In an attempt to work around this, several implementations have provided a -paravirtualized scheduler clock, which reveals the true amount of CPU time for -which a virtual machine has been running. - -4.6) Watchdogs - -Watchdog timers, such as the lock detector in Linux may fire accidentally when -running under hardware virtualization due to timer interrupts being delayed or -misinterpretation of the passage of real time. Usually, these warnings are -spurious and can be ignored, but in some circumstances it may be necessary to -disable such detection. - -4.7) Delays and precision timing - -Precise timing and delays may not be possible in a virtualized system. This -can happen if the system is controlling physical hardware, or issues delays to -compensate for slower I/O to and from devices. The first issue is not solvable -in general for a virtualized system; hardware control software can't be -adequately virtualized without a full real-time operating system, which would -require an RT aware virtualization platform. - -The second issue may cause performance problems, but this is unlikely to be a -significant issue. In many cases these delays may be eliminated through -configuration or paravirtualization. - -4.8) Covert channels and leaks - -In addition to the above problems, time information will inevitably leak to the -guest about the host in anything but a perfect implementation of virtualized -time. This may allow the guest to infer the presence of a hypervisor (as in a -red-pill type detection), and it may allow information to leak between guests -by using CPU utilization itself as a signalling channel. Preventing such -problems would require completely isolated virtual time which may not track -real time any longer. This may be useful in certain security or QA contexts, -but in general isn't recommended for real-world deployment scenarios. diff --git a/Documentation/virtual/kvm/vcpu-requests.rst b/Documentation/virtual/kvm/vcpu-requests.rst deleted file mode 100644 index 5feb3706a7ae..000000000000 --- a/Documentation/virtual/kvm/vcpu-requests.rst +++ /dev/null @@ -1,307 +0,0 @@ -================= -KVM VCPU Requests -================= - -Overview -======== - -KVM supports an internal API enabling threads to request a VCPU thread to -perform some activity. For example, a thread may request a VCPU to flush -its TLB with a VCPU request. The API consists of the following functions:: - - /* Check if any requests are pending for VCPU @vcpu. */ - bool kvm_request_pending(struct kvm_vcpu *vcpu); - - /* Check if VCPU @vcpu has request @req pending. */ - bool kvm_test_request(int req, struct kvm_vcpu *vcpu); - - /* Clear request @req for VCPU @vcpu. */ - void kvm_clear_request(int req, struct kvm_vcpu *vcpu); - - /* - * Check if VCPU @vcpu has request @req pending. When the request is - * pending it will be cleared and a memory barrier, which pairs with - * another in kvm_make_request(), will be issued. - */ - bool kvm_check_request(int req, struct kvm_vcpu *vcpu); - - /* - * Make request @req of VCPU @vcpu. Issues a memory barrier, which pairs - * with another in kvm_check_request(), prior to setting the request. - */ - void kvm_make_request(int req, struct kvm_vcpu *vcpu); - - /* Make request @req of all VCPUs of the VM with struct kvm @kvm. */ - bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); - -Typically a requester wants the VCPU to perform the activity as soon -as possible after making the request. This means most requests -(kvm_make_request() calls) are followed by a call to kvm_vcpu_kick(), -and kvm_make_all_cpus_request() has the kicking of all VCPUs built -into it. - -VCPU Kicks ----------- - -The goal of a VCPU kick is to bring a VCPU thread out of guest mode in -order to perform some KVM maintenance. To do so, an IPI is sent, forcing -a guest mode exit. However, a VCPU thread may not be in guest mode at the -time of the kick. Therefore, depending on the mode and state of the VCPU -thread, there are two other actions a kick may take. All three actions -are listed below: - -1) Send an IPI. This forces a guest mode exit. -2) Waking a sleeping VCPU. Sleeping VCPUs are VCPU threads outside guest - mode that wait on waitqueues. Waking them removes the threads from - the waitqueues, allowing the threads to run again. This behavior - may be suppressed, see KVM_REQUEST_NO_WAKEUP below. -3) Nothing. When the VCPU is not in guest mode and the VCPU thread is not - sleeping, then there is nothing to do. - -VCPU Mode ---------- - -VCPUs have a mode state, ``vcpu->mode``, that is used to track whether the -guest is running in guest mode or not, as well as some specific -outside guest mode states. The architecture may use ``vcpu->mode`` to -ensure VCPU requests are seen by VCPUs (see "Ensuring Requests Are Seen"), -as well as to avoid sending unnecessary IPIs (see "IPI Reduction"), and -even to ensure IPI acknowledgements are waited upon (see "Waiting for -Acknowledgements"). The following modes are defined: - -OUTSIDE_GUEST_MODE - - The VCPU thread is outside guest mode. - -IN_GUEST_MODE - - The VCPU thread is in guest mode. - -EXITING_GUEST_MODE - - The VCPU thread is transitioning from IN_GUEST_MODE to - OUTSIDE_GUEST_MODE. - -READING_SHADOW_PAGE_TABLES - - The VCPU thread is outside guest mode, but it wants the sender of - certain VCPU requests, namely KVM_REQ_TLB_FLUSH, to wait until the VCPU - thread is done reading the page tables. - -VCPU Request Internals -====================== - -VCPU requests are simply bit indices of the ``vcpu->requests`` bitmap. -This means general bitops, like those documented in [atomic-ops]_ could -also be used, e.g. :: - - clear_bit(KVM_REQ_UNHALT & KVM_REQUEST_MASK, &vcpu->requests); - -However, VCPU request users should refrain from doing so, as it would -break the abstraction. The first 8 bits are reserved for architecture -independent requests, all additional bits are available for architecture -dependent requests. - -Architecture Independent Requests ---------------------------------- - -KVM_REQ_TLB_FLUSH - - KVM's common MMU notifier may need to flush all of a guest's TLB - entries, calling kvm_flush_remote_tlbs() to do so. Architectures that - choose to use the common kvm_flush_remote_tlbs() implementation will - need to handle this VCPU request. - -KVM_REQ_MMU_RELOAD - - When shadow page tables are used and memory slots are removed it's - necessary to inform each VCPU to completely refresh the tables. This - request is used for that. - -KVM_REQ_PENDING_TIMER - - This request may be made from a timer handler run on the host on behalf - of a VCPU. It informs the VCPU thread to inject a timer interrupt. - -KVM_REQ_UNHALT - - This request may be made from the KVM common function kvm_vcpu_block(), - which is used to emulate an instruction that causes a CPU to halt until - one of an architectural specific set of events and/or interrupts is - received (determined by checking kvm_arch_vcpu_runnable()). When that - event or interrupt arrives kvm_vcpu_block() makes the request. This is - in contrast to when kvm_vcpu_block() returns due to any other reason, - such as a pending signal, which does not indicate the VCPU's halt - emulation should stop, and therefore does not make the request. - -KVM_REQUEST_MASK ----------------- - -VCPU requests should be masked by KVM_REQUEST_MASK before using them with -bitops. This is because only the lower 8 bits are used to represent the -request's number. The upper bits are used as flags. Currently only two -flags are defined. - -VCPU Request Flags ------------------- - -KVM_REQUEST_NO_WAKEUP - - This flag is applied to requests that only need immediate attention - from VCPUs running in guest mode. That is, sleeping VCPUs do not need - to be awaken for these requests. Sleeping VCPUs will handle the - requests when they are awaken later for some other reason. - -KVM_REQUEST_WAIT - - When requests with this flag are made with kvm_make_all_cpus_request(), - then the caller will wait for each VCPU to acknowledge its IPI before - proceeding. This flag only applies to VCPUs that would receive IPIs. - If, for example, the VCPU is sleeping, so no IPI is necessary, then - the requesting thread does not wait. This means that this flag may be - safely combined with KVM_REQUEST_NO_WAKEUP. See "Waiting for - Acknowledgements" for more information about requests with - KVM_REQUEST_WAIT. - -VCPU Requests with Associated State -=================================== - -Requesters that want the receiving VCPU to handle new state need to ensure -the newly written state is observable to the receiving VCPU thread's CPU -by the time it observes the request. This means a write memory barrier -must be inserted after writing the new state and before setting the VCPU -request bit. Additionally, on the receiving VCPU thread's side, a -corresponding read barrier must be inserted after reading the request bit -and before proceeding to read the new state associated with it. See -scenario 3, Message and Flag, of [lwn-mb]_ and the kernel documentation -[memory-barriers]_. - -The pair of functions, kvm_check_request() and kvm_make_request(), provide -the memory barriers, allowing this requirement to be handled internally by -the API. - -Ensuring Requests Are Seen -========================== - -When making requests to VCPUs, we want to avoid the receiving VCPU -executing in guest mode for an arbitrary long time without handling the -request. We can be sure this won't happen as long as we ensure the VCPU -thread checks kvm_request_pending() before entering guest mode and that a -kick will send an IPI to force an exit from guest mode when necessary. -Extra care must be taken to cover the period after the VCPU thread's last -kvm_request_pending() check and before it has entered guest mode, as kick -IPIs will only trigger guest mode exits for VCPU threads that are in guest -mode or at least have already disabled interrupts in order to prepare to -enter guest mode. This means that an optimized implementation (see "IPI -Reduction") must be certain when it's safe to not send the IPI. One -solution, which all architectures except s390 apply, is to: - -- set ``vcpu->mode`` to IN_GUEST_MODE between disabling the interrupts and - the last kvm_request_pending() check; -- enable interrupts atomically when entering the guest. - -This solution also requires memory barriers to be placed carefully in both -the requesting thread and the receiving VCPU. With the memory barriers we -can exclude the possibility of a VCPU thread observing -!kvm_request_pending() on its last check and then not receiving an IPI for -the next request made of it, even if the request is made immediately after -the check. This is done by way of the Dekker memory barrier pattern -(scenario 10 of [lwn-mb]_). As the Dekker pattern requires two variables, -this solution pairs ``vcpu->mode`` with ``vcpu->requests``. Substituting -them into the pattern gives:: - - CPU1 CPU2 - ================= ================= - local_irq_disable(); - WRITE_ONCE(vcpu->mode, IN_GUEST_MODE); kvm_make_request(REQ, vcpu); - smp_mb(); smp_mb(); - if (kvm_request_pending(vcpu)) { if (READ_ONCE(vcpu->mode) == - IN_GUEST_MODE) { - ...abort guest entry... ...send IPI... - } } - -As stated above, the IPI is only useful for VCPU threads in guest mode or -that have already disabled interrupts. This is why this specific case of -the Dekker pattern has been extended to disable interrupts before setting -``vcpu->mode`` to IN_GUEST_MODE. WRITE_ONCE() and READ_ONCE() are used to -pedantically implement the memory barrier pattern, guaranteeing the -compiler doesn't interfere with ``vcpu->mode``'s carefully planned -accesses. - -IPI Reduction -------------- - -As only one IPI is needed to get a VCPU to check for any/all requests, -then they may be coalesced. This is easily done by having the first IPI -sending kick also change the VCPU mode to something !IN_GUEST_MODE. The -transitional state, EXITING_GUEST_MODE, is used for this purpose. - -Waiting for Acknowledgements ----------------------------- - -Some requests, those with the KVM_REQUEST_WAIT flag set, require IPIs to -be sent, and the acknowledgements to be waited upon, even when the target -VCPU threads are in modes other than IN_GUEST_MODE. For example, one case -is when a target VCPU thread is in READING_SHADOW_PAGE_TABLES mode, which -is set after disabling interrupts. To support these cases, the -KVM_REQUEST_WAIT flag changes the condition for sending an IPI from -checking that the VCPU is IN_GUEST_MODE to checking that it is not -OUTSIDE_GUEST_MODE. - -Request-less VCPU Kicks ------------------------ - -As the determination of whether or not to send an IPI depends on the -two-variable Dekker memory barrier pattern, then it's clear that -request-less VCPU kicks are almost never correct. Without the assurance -that a non-IPI generating kick will still result in an action by the -receiving VCPU, as the final kvm_request_pending() check does for -request-accompanying kicks, then the kick may not do anything useful at -all. If, for instance, a request-less kick was made to a VCPU that was -just about to set its mode to IN_GUEST_MODE, meaning no IPI is sent, then -the VCPU thread may continue its entry without actually having done -whatever it was the kick was meant to initiate. - -One exception is x86's posted interrupt mechanism. In this case, however, -even the request-less VCPU kick is coupled with the same -local_irq_disable() + smp_mb() pattern described above; the ON bit -(Outstanding Notification) in the posted interrupt descriptor takes the -role of ``vcpu->requests``. When sending a posted interrupt, PIR.ON is -set before reading ``vcpu->mode``; dually, in the VCPU thread, -vmx_sync_pir_to_irr() reads PIR after setting ``vcpu->mode`` to -IN_GUEST_MODE. - -Additional Considerations -========================= - -Sleeping VCPUs --------------- - -VCPU threads may need to consider requests before and/or after calling -functions that may put them to sleep, e.g. kvm_vcpu_block(). Whether they -do or not, and, if they do, which requests need consideration, is -architecture dependent. kvm_vcpu_block() calls kvm_arch_vcpu_runnable() -to check if it should awaken. One reason to do so is to provide -architectures a function where requests may be checked if necessary. - -Clearing Requests ------------------ - -Generally it only makes sense for the receiving VCPU thread to clear a -request. However, in some circumstances, such as when the requesting -thread and the receiving VCPU thread are executed serially, such as when -they are the same thread, or when they are using some form of concurrency -control to temporarily execute synchronously, then it's possible to know -that the request may be cleared immediately, rather than waiting for the -receiving VCPU thread to handle the request in VCPU RUN. The only current -examples of this are kvm_vcpu_block() calls made by VCPUs to block -themselves. A possible side-effect of that call is to make the -KVM_REQ_UNHALT request, which may then be cleared immediately when the -VCPU returns from the call. - -References -========== - -.. [atomic-ops] Documentation/core-api/atomic_ops.rst -.. [memory-barriers] Documentation/memory-barriers.txt -.. [lwn-mb] https://lwn.net/Articles/573436/ diff --git a/Documentation/virtual/paravirt_ops.rst b/Documentation/virtual/paravirt_ops.rst deleted file mode 100644 index 6b789d27cead..000000000000 --- a/Documentation/virtual/paravirt_ops.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============ -Paravirt_ops -============ - -Linux provides support for different hypervisor virtualization technologies. -Historically different binary kernels would be required in order to support -different hypervisors, this restriction was removed with pv_ops. -Linux pv_ops is a virtualization API which enables support for different -hypervisors. It allows each hypervisor to override critical operations and -allows a single kernel binary to run on all supported execution environments -including native machine -- without any hypervisors. - -pv_ops provides a set of function pointers which represent operations -corresponding to low level critical instructions and high level -functionalities in various areas. pv-ops allows for optimizations at run -time by enabling binary patching of the low-ops critical operations -at boot time. - -pv_ops operations are classified into three categories: - -- simple indirect call - These operations correspond to high level functionality where it is - known that the overhead of indirect call isn't very important. - -- indirect call which allows optimization with binary patch - Usually these operations correspond to low level critical instructions. They - are called frequently and are performance critical. The overhead is - very important. - -- a set of macros for hand written assembly code - Hand written assembly codes (.S files) also need paravirtualization - because they include sensitive instructions or some of code paths in - them are very performance critical. diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt deleted file mode 100644 index 87b80f589e1c..000000000000 --- a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt +++ /dev/null @@ -1,4589 +0,0 @@ - User Mode Linux HOWTO - User Mode Linux Core Team - Mon Nov 18 14:16:16 EST 2002 - - This document describes the use and abuse of Jeff Dike's User Mode - Linux: a port of the Linux kernel as a normal Intel Linux process. - ______________________________________________________________________ - - Table of Contents - - 1. Introduction - - 1.1 How is User Mode Linux Different? - 1.2 Why Would I Want User Mode Linux? - - 2. Compiling the kernel and modules - - 2.1 Compiling the kernel - 2.2 Compiling and installing kernel modules - 2.3 Compiling and installing uml_utilities - - 3. Running UML and logging in - - 3.1 Running UML - 3.2 Logging in - 3.3 Examples - - 4. UML on 2G/2G hosts - - 4.1 Introduction - 4.2 The problem - 4.3 The solution - - 5. Setting up serial lines and consoles - - 5.1 Specifying the device - 5.2 Specifying the channel - 5.3 Examples - - 6. Setting up the network - - 6.1 General setup - 6.2 Userspace daemons - 6.3 Specifying ethernet addresses - 6.4 UML interface setup - 6.5 Multicast - 6.6 TUN/TAP with the uml_net helper - 6.7 TUN/TAP with a preconfigured tap device - 6.8 Ethertap - 6.9 The switch daemon - 6.10 Slip - 6.11 Slirp - 6.12 pcap - 6.13 Setting up the host yourself - - 7. Sharing Filesystems between Virtual Machines - - 7.1 A warning - 7.2 Using layered block devices - 7.3 Note! - 7.4 Another warning - 7.5 uml_moo : Merging a COW file with its backing file - - 8. Creating filesystems - - 8.1 Create the filesystem file - 8.2 Assign the file to a UML device - 8.3 Creating and mounting the filesystem - - 9. Host file access - - 9.1 Using hostfs - 9.2 hostfs as the root filesystem - 9.3 Building hostfs - - 10. The Management Console - 10.1 version - 10.2 halt and reboot - 10.3 config - 10.4 remove - 10.5 sysrq - 10.6 help - 10.7 cad - 10.8 stop - 10.9 go - - 11. Kernel debugging - - 11.1 Starting the kernel under gdb - 11.2 Examining sleeping processes - 11.3 Running ddd on UML - 11.4 Debugging modules - 11.5 Attaching gdb to the kernel - 11.6 Using alternate debuggers - - 12. Kernel debugging examples - - 12.1 The case of the hung fsck - 12.2 Episode 2: The case of the hung fsck - - 13. What to do when UML doesn't work - - 13.1 Strange compilation errors when you build from source - 13.2 (obsolete) - 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem - 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' - 13.5 UML doesn't work when /tmp is an NFS filesystem - 13.6 UML hangs on boot when compiled with gprof support - 13.7 syslogd dies with a SIGTERM on startup - 13.8 TUN/TAP networking doesn't work on a 2.4 host - 13.9 You can network to the host but not to other machines on the net - 13.10 I have no root and I want to scream - 13.11 UML build conflict between ptrace.h and ucontext.h - 13.12 The UML BogoMips is exactly half the host's BogoMips - 13.13 When you run UML, it immediately segfaults - 13.14 xterms appear, then immediately disappear - 13.15 Any other panic, hang, or strange behavior - - 14. Diagnosing Problems - - 14.1 Case 1 : Normal kernel panics - 14.2 Case 2 : Tracing thread panics - 14.3 Case 3 : Tracing thread panics caused by other threads - 14.4 Case 4 : Hangs - - 15. Thanks - - 15.1 Code and Documentation - 15.2 Flushing out bugs - 15.3 Buglets and clean-ups - 15.4 Case Studies - 15.5 Other contributions - - - ______________________________________________________________________ - - 1. Introduction - - Welcome to User Mode Linux. It's going to be fun. - - - - 1.1. How is User Mode Linux Different? - - Normally, the Linux Kernel talks straight to your hardware (video - card, keyboard, hard drives, etc), and any programs which run ask the - kernel to operate the hardware, like so: - - - - +-----------+-----------+----+ - | Process 1 | Process 2 | ...| - +-----------+-----------+----+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - The User Mode Linux Kernel is different; instead of talking to the - hardware, it talks to a `real' Linux kernel (called the `host kernel' - from now on), like any other program. Programs can then run inside - User-Mode Linux as if they were running under a normal kernel, like - so: - - - - +----------------+ - | Process 2 | ...| - +-----------+----------------+ - | Process 1 | User-Mode Linux| - +----------------------------+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - - 1.2. Why Would I Want User Mode Linux? - - - 1. If User Mode Linux crashes, your host kernel is still fine. - - 2. You can run a usermode kernel as a non-root user. - - 3. You can debug the User Mode Linux like any normal process. - - 4. You can run gprof (profiling) and gcov (coverage testing). - - 5. You can play with your kernel without breaking things. - - 6. You can use it as a sandbox for testing new apps. - - 7. You can try new development kernels safely. - - 8. You can run different distributions simultaneously. - - 9. It's extremely fun. - - - - - - 2. Compiling the kernel and modules - - - - - 2.1. Compiling the kernel - - - Compiling the user mode kernel is just like compiling any other - kernel. Let's go through the steps, using 2.4.0-prerelease (current - as of this writing) as an example: - - - 1. Download the latest UML patch from - - the download page - . - - - 3. Make a directory and unpack the kernel into it. - - - - host% - mkdir ~/uml - - - - - - - host% - cd ~/uml - - - - - - - host% - tar -xzvf linux-2.4.0-prerelease.tar.bz2 - - - - - - - 4. Apply the patch using - - - - host% - cd ~/uml/linux - - - - host% - bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 - - - - - - - 5. Run your favorite config; `make xconfig ARCH=um' is the most - convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' - will work as well. The defaults will give you a useful kernel. If - you want to change something, go ahead, it probably won't hurt - anything. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries - will not run. They will immediately segfault. See ``UML on 2G/2G - hosts'' for the scoop on running UML on your system. - - - - 6. Finish with `make linux ARCH=um': the result is a file called - `linux' in the top directory of your source tree. - - Make sure that you don't build this kernel in /usr/src/linux. On some - distributions, /usr/include/asm is a link into this pool. The user- - mode build changes the other end of that link, and things that include - stop compiling. - - The sources are also available from cvs at the project's cvs page, - which has directions on getting the sources. You can also browse the - CVS pool from there. - - If you get the CVS sources, you will have to check them out into an - empty directory. You will then have to copy each file into the - corresponding directory in the appropriate kernel pool. - - If you don't have the latest kernel pool, you can get the - corresponding user-mode sources with - - - host% cvs co -r v_2_3_x linux - - - - - where 'x' is the version in your pool. Note that you will not get the - bug fixes and enhancements that have gone into subsequent releases. - - - 2.2. Compiling and installing kernel modules - - UML modules are built in the same way as the native kernel (with the - exception of the 'ARCH=um' that you always need for UML): - - - host% make modules ARCH=um - - - - - Any modules that you want to load into this kernel need to be built in - the user-mode pool. Modules from the native kernel won't work. - - You can install them by using ftp or something to copy them into the - virtual machine and dropping them into /lib/modules/`uname -r`. - - You can also get the kernel build process to install them as follows: - - 1. with the kernel not booted, mount the root filesystem in the top - level of the kernel pool: - - - host% mount root_fs mnt -o loop - - - - - - - 2. run - - - host% - make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um - - - - - - - 3. unmount the filesystem - - - host% umount mnt - - - - - - - 4. boot the kernel on it - - - When the system is booted, you can use insmod as usual to get the - modules into the kernel. A number of things have been loaded into UML - as modules, especially filesystems and network protocols and filters, - so most symbols which need to be exported probably already are. - However, if you do find symbols that need exporting, let us - know, and - they'll be "taken care of". - - - - 2.3. Compiling and installing uml_utilities - - Many features of the UML kernel require a user-space helper program, - so a uml_utilities package is distributed separately from the kernel - patch which provides these helpers. Included within this is: - - o port-helper - Used by consoles which connect to xterms or ports - - o tunctl - Configuration tool to create and delete tap devices - - o uml_net - Setuid binary for automatic tap device configuration - - o uml_switch - User-space virtual switch required for daemon - transport - - The uml_utilities tree is compiled with: - - - host# - make && make install - - - - - Note that UML kernel patches may require a specific version of the - uml_utilities distribution. If you don't keep up with the mailing - lists, ensure that you have the latest release of uml_utilities if you - are experiencing problems with your UML kernel, particularly when - dealing with consoles or command-line switches to the helper programs - - - - - - - - - 3. Running UML and logging in - - - - 3.1. Running UML - - It runs on 2.2.15 or later, and all 2.4 kernels. - - - Booting UML is straightforward. Simply run 'linux': it will try to - mount the file `root_fs' in the current directory. You do not need to - run it as root. If your root filesystem is not named `root_fs', then - you need to put a `ubd0=root_fs_whatever' switch on the linux command - line. - - - You will need a filesystem to boot UML from. There are a number - available for download from here . There are also several tools - which can be - used to generate UML-compatible filesystem images from media. - The kernel will boot up and present you with a login prompt. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries will - not run. They will immediately segfault. See ``UML on 2G/2G hosts'' - for the scoop on running UML on your system. - - - - 3.2. Logging in - - - - The prepackaged filesystems have a root account with password 'root' - and a user account with password 'user'. The login banner will - generally tell you how to log in. So, you log in and you will find - yourself inside a little virtual machine. Our filesystems have a - variety of commands and utilities installed (and it is fairly easy to - add more), so you will have a lot of tools with which to poke around - the system. - - There are a couple of other ways to log in: - - o On a virtual console - - - - Each virtual console that is configured (i.e. the device exists in - /dev and /etc/inittab runs a getty on it) will come up in its own - xterm. If you get tired of the xterms, read ``Setting up serial - lines and consoles'' to see how to attach the consoles to - something else, like host ptys. - - - - o Over the serial line - - - In the boot output, find a line that looks like: - - - - serial line 0 assigned pty /dev/ptyp1 - - - - - Attach your favorite terminal program to the corresponding tty. I.e. - for minicom, the command would be - - - host% minicom -o -p /dev/ttyp1 - - - - - - - o Over the net - - - If the network is running, then you can telnet to the virtual - machine and log in to it. See ``Setting up the network'' to learn - about setting up a virtual network. - - When you're done using it, run halt, and the kernel will bring itself - down and the process will exit. - - - 3.3. Examples - - Here are some examples of UML in action: - - o A login session - - o A virtual network - - - - - - - - 4. UML on 2G/2G hosts - - - - - 4.1. Introduction - - - Most Linux machines are configured so that the kernel occupies the - upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and - processes use the lower 3G (0x00000000 - 0xbfffffff). However, some - machine are configured with a 2G/2G split, with the kernel occupying - the upper 2G (0x80000000 - 0xffffffff) and processes using the lower - 2G (0x00000000 - 0x7fffffff). - - - - - 4.2. The problem - - - The prebuilt UML binaries on this site will not run on 2G/2G hosts - because UML occupies the upper .5G of the 3G process address space - (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right - in the middle of the kernel address space, so UML won't even load - it - will immediately segfault. - - - - - 4.3. The solution - - - The fix for this is to rebuild UML from source after enabling - CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to - load itself in the top .5G of that smaller process address space, - where it will run fine. See ``Compiling the kernel and modules'' if - you need help building UML from source. - - - - - - - - - - - 5. Setting up serial lines and consoles - - - It is possible to attach UML serial lines and consoles to many types - of host I/O channels by specifying them on the command line. - - - You can attach them to host ptys, ttys, file descriptors, and ports. - This allows you to do things like - - o have a UML console appear on an unused host console, - - o hook two virtual machines together by having one attach to a pty - and having the other attach to the corresponding tty - - o make a virtual machine accessible from the net by attaching a - console to a port on the host. - - - The general format of the command line option is device=channel. - - - - 5.1. Specifying the device - - Devices are specified with "con" or "ssl" (console or serial line, - respectively), optionally with a device number if you are talking - about a specific device. - - - Using just "con" or "ssl" describes all of the consoles or serial - lines. If you want to talk about console #3 or serial line #10, they - would be "con3" and "ssl10", respectively. - - - A specific device name will override a less general "con=" or "ssl=". - So, for example, you can assign a pty to each of the serial lines - except for the first two like this: - - - ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 - - - - - The specificity of the device name is all that matters; order on the - command line is irrelevant. - - - - 5.2. Specifying the channel - - There are a number of different types of channels to attach a UML - device to, each with a different way of specifying exactly what to - attach to. - - o pseudo-terminals - device=pty pts terminals - device=pts - - - This will cause UML to allocate a free host pseudo-terminal for the - device. The terminal that it got will be announced in the boot - log. You access it by attaching a terminal program to the - corresponding tty: - - o screen /dev/pts/n - - o screen /dev/ttyxx - - o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts - devices - - o kermit - start it up, 'open' the device, then 'connect' - - - - - - o terminals - device=tty:tty device file - - - This will make UML attach the device to the specified tty (i.e - - - con1=tty:/dev/tty3 - - - - - will attach UML's console 1 to the host's /dev/tty3). If the tty that - you specify is the slave end of a tty/pty pair, something else must - have already opened the corresponding pty in order for this to work. - - - - - - o xterms - device=xterm - - - UML will run an xterm and the device will be attached to it. - - - - - - o Port - device=port:port number - - - This will attach the UML devices to the specified host port. - Attaching console 1 to the host's port 9000 would be done like - this: - - - con1=port:9000 - - - - - Attaching all the serial lines to that port would be done similarly: - - - ssl=port:9000 - - - - - You access these devices by telnetting to that port. Each active tel- - net session gets a different device. If there are more telnets to a - port than UML devices attached to it, then the extra telnet sessions - will block until an existing telnet detaches, or until another device - becomes active (i.e. by being activated in /etc/inittab). - - This channel has the advantage that you can both attach multiple UML - devices to it and know how to access them without reading the UML boot - log. It is also unique in allowing access to a UML from remote - machines without requiring that the UML be networked. This could be - useful in allowing public access to UMLs because they would be - accessible from the net, but wouldn't need any kind of network - filtering or access control because they would have no network access. - - - If you attach the main console to a portal, then the UML boot will - appear to hang. In reality, it's waiting for a telnet to connect, at - which point the boot will proceed. - - - - - - o already-existing file descriptors - device=file descriptor - - - If you set up a file descriptor on the UML command line, you can - attach a UML device to it. This is most commonly used to put the - main console back on stdin and stdout after assigning all the other - consoles to something else: - - - con0=fd:0,fd:1 con=pts - - - - - - - - - o Nothing - device=null - - - This allows the device to be opened, in contrast to 'none', but - reads will block, and writes will succeed and the data will be - thrown out. - - - - - - o None - device=none - - - This causes the device to disappear. - - - - You can also specify different input and output channels for a device - by putting a comma between them: - - - ssl3=tty:/dev/tty2,xterm - - - - - will cause serial line 3 to accept input on the host's /dev/tty2 and - display output on an xterm. That's a silly example - the most common - use of this syntax is to reattach the main console to stdin and stdout - as shown above. - - - If you decide to move the main console away from stdin/stdout, the - initial boot output will appear in the terminal that you're running - UML in. However, once the console driver has been officially - initialized, then the boot output will start appearing wherever you - specified that console 0 should be. That device will receive all - subsequent output. - - - - 5.3. Examples - - There are a number of interesting things you can do with this - capability. - - - First, this is how you get rid of those bleeding console xterms by - attaching them to host ptys: - - - con=pty con0=fd:0,fd:1 - - - - - This will make a UML console take over an unused host virtual console, - so that when you switch to it, you will see the UML login prompt - rather than the host login prompt: - - - con1=tty:/dev/tty6 - - - - - You can attach two virtual machines together with what amounts to a - serial line as follows: - - Run one UML with a serial line attached to a pty - - - - ssl1=pty - - - - - Look at the boot log to see what pty it got (this example will assume - that it got /dev/ptyp1). - - Boot the other UML with a serial line attached to the corresponding - tty - - - - ssl1=tty:/dev/ttyp1 - - - - - Log in, make sure that it has no getty on that serial line, attach a - terminal program like minicom to it, and you should see the login - prompt of the other virtual machine. - - - 6. Setting up the network - - - - This page describes how to set up the various transports and to - provide a UML instance with network access to the host, other machines - on the local net, and the rest of the net. - - - As of 2.4.5, UML networking has been completely redone to make it much - easier to set up, fix bugs, and add new features. - - - There is a new helper, uml_net, which does the host setup that - requires root privileges. - - - There are currently five transport types available for a UML virtual - machine to exchange packets with other hosts: - - o ethertap - - o TUN/TAP - - o Multicast - - o a switch daemon - - o slip - - o slirp - - o pcap - - The TUN/TAP, ethertap, slip, and slirp transports allow a UML - instance to exchange packets with the host. They may be directed - to the host or the host may just act as a router to provide access - to other physical or virtual machines. - - - The pcap transport is a synthetic read-only interface, using the - libpcap binary to collect packets from interfaces on the host and - filter them. This is useful for building preconfigured traffic - monitors or sniffers. - - - The daemon and multicast transports provide a completely virtual - network to other virtual machines. This network is completely - disconnected from the physical network unless one of the virtual - machines on it is acting as a gateway. - - - With so many host transports, which one should you use? Here's when - you should use each one: - - o ethertap - if you want access to the host networking and it is - running 2.2 - - o TUN/TAP - if you want access to the host networking and it is - running 2.4. Also, the TUN/TAP transport is able to use a - preconfigured device, allowing it to avoid using the setuid uml_net - helper, which is a security advantage. - - o Multicast - if you want a purely virtual network and you don't want - to set up anything but the UML - - o a switch daemon - if you want a purely virtual network and you - don't mind running the daemon in order to get somewhat better - performance - - o slip - there is no particular reason to run the slip backend unless - ethertap and TUN/TAP are just not available for some reason - - o slirp - if you don't have root access on the host to setup - networking, or if you don't want to allocate an IP to your UML - - o pcap - not much use for actual network connectivity, but great for - monitoring traffic on the host - - Ethertap is available on 2.4 and works fine. TUN/TAP is preferred - to it because it has better performance and ethertap is officially - considered obsolete in 2.4. Also, the root helper only needs to - run occasionally for TUN/TAP, rather than handling every packet, as - it does with ethertap. This is a slight security advantage since - it provides fewer opportunities for a nasty UML user to somehow - exploit the helper's root privileges. - - - 6.1. General setup - - First, you must have the virtual network enabled in your UML. If are - running a prebuilt kernel from this site, everything is already - enabled. If you build the kernel yourself, under the "Network device - support" menu, enable "Network device support", and then the three - transports. - - - The next step is to provide a network device to the virtual machine. - This is done by describing it on the kernel command line. - - The general format is - - - eth = , - - - - - For example, a virtual ethernet device may be attached to a host - ethertap device as follows: - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - This sets up eth0 inside the virtual machine to attach itself to the - host /dev/tap0, assigns it an ethernet address, and assigns the host - tap0 interface an IP address. - - - - Note that the IP address you assign to the host end of the tap device - must be different than the IP you assign to the eth device inside UML. - If you are short on IPs and don't want to consume two per UML, then - you can reuse the host's eth IP address for the host ends of the tap - devices. Internally, the UMLs must still get unique IPs for their eth - devices. You can also give the UMLs non-routable IPs (192.168.x.x or - 10.x.x.x) and have the host masquerade them. This will let outgoing - connections work, but incoming connections won't without more work, - such as port forwarding from the host. - Also note that when you configure the host side of an interface, it is - only acting as a gateway. It will respond to pings sent to it - locally, but is not useful to do that since it's a host interface. - You are not talking to the UML when you ping that interface and get a - response. - - - You can also add devices to a UML and remove them at runtime. See the - ``The Management Console'' page for details. - - - The sections below describe this in more detail. - - - Once you've decided how you're going to set up the devices, you boot - UML, log in, configure the UML side of the devices, and set up routes - to the outside world. At that point, you will be able to talk to any - other machines, physical or virtual, on the net. - - - If ifconfig inside UML fails and the network refuses to come up, run - tell you what went wrong. - - - - 6.2. Userspace daemons - - You will likely need the setuid helper, or the switch daemon, or both. - They are both installed with the RPM and deb, so if you've installed - either, you can skip the rest of this section. - - - If not, then you need to check them out of CVS, build them, and - install them. The helper is uml_net, in CVS /tools/uml_net, and the - daemon is uml_switch, in CVS /tools/uml_router. They are both built - with a plain 'make'. Both need to be installed in a directory that's - in your path - /usr/bin is recommend. On top of that, uml_net needs - to be setuid root. - - - - 6.3. Specifying ethernet addresses - - Below, you will see that the TUN/TAP, ethertap, and daemon interfaces - allow you to specify hardware addresses for the virtual ethernet - devices. This is generally not necessary. If you don't have a - specific reason to do it, you probably shouldn't. If one is not - specified on the command line, the driver will assign one based on the - device IP address. It will provide the address fe:fd:nn:nn:nn:nn - where nn.nn.nn.nn is the device IP address. This is nearly always - sufficient to guarantee a unique hardware address for the device. A - couple of exceptions are: - - o Another set of virtual ethernet devices are on the same network and - they are assigned hardware addresses using a different scheme which - may conflict with the UML IP address-based scheme - - o You aren't going to use the device for IP networking, so you don't - assign the device an IP address - - If you let the driver provide the hardware address, you should make - sure that the device IP address is known before the interface is - brought up. So, inside UML, this will guarantee that: - - - - UML# - ifconfig eth0 192.168.0.250 up - - - - - If you decide to assign the hardware address yourself, make sure that - the first byte of the address is even. Addresses with an odd first - byte are broadcast addresses, which you don't want assigned to a - device. - - - - 6.4. UML interface setup - - Once the network devices have been described on the command line, you - should boot UML and log in. - - - The first thing to do is bring the interface up: - - - UML# ifconfig ethn ip-address up - - - - - You should be able to ping the host at this point. - - - To reach the rest of the world, you should set a default route to the - host: - - - UML# route add default gw host ip - - - - - Again, with host ip of 192.168.0.4: - - - UML# route add default gw 192.168.0.4 - - - - - This page used to recommend setting a network route to your local net. - This is wrong, because it will cause UML to try to figure out hardware - addresses of the local machines by arping on the interface to the - host. Since that interface is basically a single strand of ethernet - with two nodes on it (UML and the host) and arp requests don't cross - networks, they will fail to elicit any responses. So, what you want - is for UML to just blindly throw all packets at the host and let it - figure out what to do with them, which is what leaving out the network - route and adding the default route does. - - - Note: If you can't communicate with other hosts on your physical - ethernet, it's probably because of a network route that's - automatically set up. If you run 'route -n' and see a route that - looks like this: - - - - - Destination Gateway Genmask Flags Metric Ref Use Iface - 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 - - - - - with a mask that's not 255.255.255.255, then replace it with a route - to your host: - - - UML# - route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 - - - - - - - UML# - route add -host 192.168.0.4 dev eth0 - - - - - This, plus the default route to the host, will allow UML to exchange - packets with any machine on your ethernet. - - - - 6.5. Multicast - - The simplest way to set up a virtual network between multiple UMLs is - to use the mcast transport. This was written by Harald Welte and is - present in UML version 2.4.5-5um and later. Your system must have - multicast enabled in the kernel and there must be a multicast-capable - network device on the host. Normally, this is eth0, but if there is - no ethernet card on the host, then you will likely get strange error - messages when you bring the device up inside UML. - - - To use it, run two UMLs with - - - eth0=mcast - - - - - on their command lines. Log in, configure the ethernet device in each - machine with different IP addresses: - - - UML1# ifconfig eth0 192.168.0.254 - - - - - - - UML2# ifconfig eth0 192.168.0.253 - - - - - and they should be able to talk to each other. - - The full set of command line options for this transport are - - - - ethn=mcast,ethernet address,multicast - address,multicast port,ttl - - - - - Harald's original README is here and explains these in detail, as well as - some other issues. - - There is also a related point-to-point only "ucast" transport. - This is useful when your network does not support multicast, and - all network connections are simple point to point links. - - The full set of command line options for this transport are - - - ethn=ucast,ethernet address,remote address,listen port,remote port - - - - - 6.6. TUN/TAP with the uml_net helper - - TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the - host. The TUN/TAP backend has been in UML since 2.4.9-3um. - - - The easiest way to get up and running is to let the setuid uml_net - helper do the host setup for you. This involves insmod-ing the tun.o - module if necessary, configuring the device, and setting up IP - forwarding, routing, and proxy arp. If you are new to UML networking, - do this first. If you're concerned about the security implications of - the setuid helper, use it to get up and running, then read the next - section to see how to have UML use a preconfigured tap device, which - avoids the use of uml_net. - - - If you specify an IP address for the host side of the device, the - uml_net helper will do all necessary setup on the host - the only - requirement is that TUN/TAP be available, either built in to the host - kernel or as the tun.o module. - - The format of the command line switch to attach a device to a TUN/TAP - device is - - - eth =tuntap,,, - - - - - For example, this argument will attach the UML's eth0 to the next - available tap device and assign an ethernet address to it based on its - IP address - - - eth0=tuntap,,,192.168.0.254 - - - - - - - Note that the IP address that must be used for the eth device inside - UML is fixed by the routing and proxy arp that is set up on the - TUN/TAP device on the host. You can use a different one, but it won't - work because reply packets won't reach the UML. This is a feature. - It prevents a nasty UML user from doing things like setting the UML IP - to the same as the network's nameserver or mail server. - - - There are a couple potential problems with running the TUN/TAP - transport on a 2.4 host kernel - - o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host - kernel or use the ethertap transport. - - o With an upgraded kernel, TUN/TAP may fail with - - - File descriptor in bad state - - - - - This is due to a header mismatch between the upgraded kernel and the - kernel that was originally installed on the machine. The fix is to - make sure that /usr/src/linux points to the headers for the running - kernel. - - These were pointed out by Tim Robinson in - name="this uml- - user post"> . - - - - 6.7. TUN/TAP with a preconfigured tap device - - If you prefer not to have UML use uml_net (which is somewhat - insecure), with UML 2.4.17-11, you can set up a TUN/TAP device - beforehand. The setup needs to be done as root, but once that's done, - there is no need for root assistance. Setting up the device is done - as follows: - - o Create the device with tunctl (available from the UML utilities - tarball) - - - - - host# tunctl -u uid - - - - - where uid is the user id or username that UML will be run as. This - will tell you what device was created. - - o Configure the device IP (change IP addresses and device name to - suit) - - - - - host# ifconfig tap0 192.168.0.254 up - - - - - - o Set up routing and arping if desired - this is my recipe, there are - other ways of doing the same thing - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' - - host# - route add -host 192.168.0.253 dev tap0 - - - - - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' - - - - - - - host# - arp -Ds 192.168.0.253 eth0 pub - - - - - Note that this must be done every time the host boots - this configu- - ration is not stored across host reboots. So, it's probably a good - idea to stick it in an rc file. An even better idea would be a little - utility which reads the information from a config file and sets up - devices at boot time. - - o Rather than using up two IPs and ARPing for one of them, you can - also provide direct access to your LAN by the UML by using a - bridge. - - - host# - brctl addbr br0 - - - - - - - host# - ifconfig eth0 0.0.0.0 promisc up - - - - - - - host# - ifconfig tap0 0.0.0.0 promisc up - - - - - - - host# - ifconfig br0 192.168.0.1 netmask 255.255.255.0 up - - - - - - - - host# - brctl stp br0 off - - - - - - - host# - brctl setfd br0 1 - - - - - - - host# - brctl sethello br0 1 - - - - - - - host# - brctl addif br0 eth0 - - - - - - - host# - brctl addif br0 tap0 - - - - - Note that 'br0' should be setup using ifconfig with the existing IP - address of eth0, as eth0 no longer has its own IP. - - o - - - Also, the /dev/net/tun device must be writable by the user running - UML in order for the UML to use the device that's been configured - for it. The simplest thing to do is - - - host# chmod 666 /dev/net/tun - - - - - Making it world-writable looks bad, but it seems not to be - exploitable as a security hole. However, it does allow anyone to cre- - ate useless tap devices (useless because they can't configure them), - which is a DOS attack. A somewhat more secure alternative would to be - to create a group containing all the users who have preconfigured tap - devices and chgrp /dev/net/tun to that group with mode 664 or 660. - - - o Once the device is set up, run UML with 'eth0=tuntap,device name' - (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the - mconsole config command). - - o Bring the eth device up in UML and you're in business. - - If you don't want that tap device any more, you can make it non- - persistent with - - - host# tunctl -d tap device - - - - - Finally, tunctl has a -b (for brief mode) switch which causes it to - output only the name of the tap device it created. This makes it - suitable for capture by a script: - - - host# TAP=`tunctl -u 1000 -b` - - - - - - - 6.8. Ethertap - - Ethertap is the general mechanism on 2.2 for userspace processes to - exchange packets with the kernel. - - - - To use this transport, you need to describe the virtual network device - on the UML command line. The general format for this is - - - eth =ethertap, , , - - - - - So, the previous example - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - attaches the UML eth0 device to the host /dev/tap0, assigns it the - ethernet address fe:fd:0:0:0:1, and assigns the IP address - 192.168.0.254 to the tap device. - - - - The tap device is mandatory, but the others are optional. If the - ethernet address is omitted, one will be assigned to it. - - - The presence of the tap IP address will cause the helper to run and do - whatever host setup is needed to allow the virtual machine to - communicate with the outside world. If you're not sure you know what - you're doing, this is the way to go. - - - If it is absent, then you must configure the tap device and whatever - arping and routing you will need on the host. However, even in this - case, the uml_net helper still needs to be in your path and it must be - setuid root if you're not running UML as root. This is because the - tap device doesn't support SIGIO, which UML needs in order to use - something as a source of input. So, the helper is used as a - convenient asynchronous IO thread. - - If you're using the uml_net helper, you can ignore the following host - setup - uml_net will do it for you. You just need to make sure you - have ethertap available, either built in to the host kernel or - available as a module. - - - If you want to set things up yourself, you need to make sure that the - appropriate /dev entry exists. If it doesn't, become root and create - it as follows: - - - mknod /dev/tap c 36 + 16 - - - - - For example, this is how to create /dev/tap0: - - - mknod /dev/tap0 c 36 0 + 16 - - - - - You also need to make sure that the host kernel has ethertap support. - If ethertap is enabled as a module, you apparently need to insmod - ethertap once for each ethertap device you want to enable. So, - - - host# - insmod ethertap - - - - - will give you the tap0 interface. To get the tap1 interface, you need - to run - - - host# - insmod ethertap unit=1 -o ethertap1 - - - - - - - - 6.9. The switch daemon - - Note: This is the daemon formerly known as uml_router, but which was - renamed so the network weenies of the world would stop growling at me. - - - The switch daemon, uml_switch, provides a mechanism for creating a - totally virtual network. By default, it provides no connection to the - host network (but see -tap, below). - - - The first thing you need to do is run the daemon. Running it with no - arguments will make it listen on a default pair of unix domain - sockets. - - - If you want it to listen on a different pair of sockets, use - - - -unix control socket data socket - - - - - - If you want it to act as a hub rather than a switch, use - - - -hub - - - - - - If you want the switch to be connected to host networking (allowing - the umls to get access to the outside world through the host), use - - - -tap tap0 - - - - - - Note that the tap device must be preconfigured (see "TUN/TAP with a - preconfigured tap device", above). If you're using a different tap - device than tap0, specify that instead of tap0. - - - uml_switch can be backgrounded as follows - - - host% - uml_switch [ options ] < /dev/null > /dev/null - - - - - The reason it doesn't background by default is that it listens to - stdin for EOF. When it sees that, it exits. - - - The general format of the kernel command line switch is - - - - ethn=daemon,ethernet address,socket - type,control socket,data socket - - - - - You can leave off everything except the 'daemon'. You only need to - specify the ethernet address if the one that will be assigned to it - isn't acceptable for some reason. The rest of the arguments describe - how to communicate with the daemon. You should only specify them if - you told the daemon to use different sockets than the default. So, if - you ran the daemon with no arguments, running the UML on the same - machine with - eth0=daemon - - - - - will cause the eth0 driver to attach itself to the daemon correctly. - - - - 6.10. Slip - - Slip is another, less general, mechanism for a process to communicate - with the host networking. In contrast to the ethertap interface, - which exchanges ethernet frames with the host and can be used to - transport any higher-level protocol, it can only be used to transport - IP. - - - The general format of the command line switch is - - - - ethn=slip,slip IP - - - - - The slip IP argument is the IP address that will be assigned to the - host end of the slip device. If it is specified, the helper will run - and will set up the host so that the virtual machine can reach it and - the rest of the network. - - - There are some oddities with this interface that you should be aware - of. You should only specify one slip device on a given virtual - machine, and its name inside UML will be 'umn', not 'eth0' or whatever - you specified on the command line. These problems will be fixed at - some point. - - - - 6.11. Slirp - - slirp uses an external program, usually /usr/bin/slirp, to provide IP - only networking connectivity through the host. This is similar to IP - masquerading with a firewall, although the translation is performed in - user-space, rather than by the kernel. As slirp does not set up any - interfaces on the host, or changes routing, slirp does not require - root access or setuid binaries on the host. - - - The general format of the command line switch for slirp is: - - - - ethn=slirp,ethernet address,slirp path - - - - - The ethernet address is optional, as UML will set up the interface - with an ethernet address based upon the initial IP address of the - interface. The slirp path is generally /usr/bin/slirp, although it - will depend on distribution. - - - The slirp program can have a number of options passed to the command - line and we can't add them to the UML command line, as they will be - parsed incorrectly. Instead, a wrapper shell script can be written or - the options inserted into the /.slirprc file. More information on - all of the slirp options can be found in its man pages. - - - The eth0 interface on UML should be set up with the IP 10.2.0.15, - although you can use anything as long as it is not used by a network - you will be connecting to. The default route on UML should be set to - use - - - UML# - route add default dev eth0 - - - - - slirp provides a number of useful IP addresses which can be used by - UML, such as 10.0.2.3 which is an alias for the DNS server specified - in /etc/resolv.conf on the host or the IP given in the 'dns' option - for slirp. - - - Even with a baudrate setting higher than 115200, the slirp connection - is limited to 115200. If you need it to go faster, the slirp binary - needs to be compiled with FULL_BOLT defined in config.h. - - - - 6.12. pcap - - The pcap transport is attached to a UML ethernet device on the command - line or with uml_mconsole with the following syntax: - - - - ethn=pcap,host interface,filter - expression,option1,option2 - - - - - The expression and options are optional. - - - The interface is whatever network device on the host you want to - sniff. The expression is a pcap filter expression, which is also what - tcpdump uses, so if you know how to specify tcpdump filters, you will - use the same expressions here. The options are up to two of - 'promisc', control whether pcap puts the host interface into - promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap - expression optimizer is used. - - - Example: - - - - eth0=pcap,eth0,tcp - - eth1=pcap,eth0,!tcp - - - - will cause the UML eth0 to emit all tcp packets on the host eth0 and - the UML eth1 to emit all non-tcp packets on the host eth0. - - - - 6.13. Setting up the host yourself - - If you don't specify an address for the host side of the ethertap or - slip device, UML won't do any setup on the host. So this is what is - needed to get things working (the examples use a host-side IP of - 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your - own network): - - o The device needs to be configured with its IP address. Tap devices - are also configured with an mtu of 1484. Slip devices are - configured with a point-to-point address pointing at the UML ip - address. - - - host# ifconfig tap0 arp mtu 1484 192.168.0.251 up - - - - - - - host# - ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up - - - - - - o If a tap device is being set up, a route is set to the UML IP. - - - UML# route add -host 192.168.0.250 gw 192.168.0.251 - - - - - - o To allow other hosts on your network to see the virtual machine, - proxy arp is set up for it. - - - host# arp -Ds 192.168.0.250 eth0 pub - - - - - - o Finally, the host is set up to route packets. - - - host# echo 1 > /proc/sys/net/ipv4/ip_forward - - - - - - - - - - - 7. Sharing Filesystems between Virtual Machines - - - - - 7.1. A warning - - Don't attempt to share filesystems simply by booting two UMLs from the - same file. That's the same thing as booting two physical machines - from a shared disk. It will result in filesystem corruption. - - - - 7.2. Using layered block devices - - The way to share a filesystem between two virtual machines is to use - the copy-on-write (COW) layering capability of the ubd block driver. - As of 2.4.6-2um, the driver supports layering a read-write private - device over a read-only shared device. A machine's writes are stored - in the private device, while reads come from either device - the - private one if the requested block is valid in it, the shared one if - not. Using this scheme, the majority of data which is unchanged is - shared between an arbitrary number of virtual machines, each of which - has a much smaller file containing the changes that it has made. With - a large number of UMLs booting from a large root filesystem, this - leads to a huge disk space saving. It will also help performance, - since the host will be able to cache the shared data using a much - smaller amount of memory, so UML disk requests will be served from the - host's memory rather than its disks. - - - - - To add a copy-on-write layer to an existing block device file, simply - add the name of the COW file to the appropriate ubd switch: - - - ubd0=root_fs_cow,root_fs_debian_22 - - - - - where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is - the existing shared filesystem. The COW file need not exist. If it - doesn't, the driver will create and initialize it. Once the COW file - has been initialized, it can be used on its own on the command line: - - - ubd0=root_fs_cow - - - - - The name of the backing file is stored in the COW file header, so it - would be redundant to continue specifying it on the command line. - - - - 7.3. Note! - - When checking the size of the COW file in order to see the gobs of - space that you're saving, make sure you use 'ls -ls' to see the actual - disk consumption rather than the length of the file. The COW file is - sparse, so the length will be very different from the disk usage. - Here is a 'ls -l' of a COW file and backing file from one boot and - shutdown: - host% ls -l cow.debian debian2.2 - -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Doesn't look like much saved space, does it? Well, here's 'ls -ls': - - - host% ls -ls cow.debian debian2.2 - 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Now, you can see that the COW file has less than a meg of disk, rather - than 492 meg. - - - - 7.4. Another warning - - Once a filesystem is being used as a readonly backing file for a COW - file, do not boot directly from it or modify it in any way. Doing so - will invalidate any COW files that are using it. The mtime and size - of the backing file are stored in the COW file header at its creation, - and they must continue to match. If they don't, the driver will - refuse to use the COW file. - - - - - If you attempt to evade this restriction by changing either the - backing file or the COW header by hand, you will get a corrupted - filesystem. - - - - - Among other things, this means that upgrading the distribution in a - backing file and expecting that all of the COW files using it will see - the upgrade will not work. - - - - - 7.5. uml_moo : Merging a COW file with its backing file - - Depending on how you use UML and COW devices, it may be advisable to - merge the changes in the COW file into the backing file every once in - a while. - - - - - The utility that does this is uml_moo. Its usage is - - - host% uml_moo COW file new backing file - - - - - There's no need to specify the backing file since that information is - already in the COW file header. If you're paranoid, boot the new - merged file, and if you're happy with it, move it over the old backing - file. - - - - - uml_moo creates a new backing file by default as a safety measure. It - also has a destructive merge option which will merge the COW file - directly into its current backing file. This is really only usable - when the backing file only has one COW file associated with it. If - there are multiple COWs associated with a backing file, a -d merge of - one of them will invalidate all of the others. However, it is - convenient if you're short of disk space, and it should also be - noticeably faster than a non-destructive merge. - - - - - uml_moo is installed with the UML deb and RPM. If you didn't install - UML from one of those packages, you can also get it from the UML - utilities tar file in tools/moo. - - - - - - - - - 8. Creating filesystems - - - You may want to create and mount new UML filesystems, either because - your root filesystem isn't large enough or because you want to use a - filesystem other than ext2. - - - This was written on the occasion of reiserfs being included in the - 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will - talk about reiserfs. This information is generic, and the examples - should be easy to translate to the filesystem of your choice. - - - 8.1. Create the filesystem file - - dd is your friend. All you need to do is tell dd to create an empty - file of the appropriate size. I usually make it sparse to save time - and to avoid allocating disk space until it's actually used. For - example, the following command will create a sparse 100 meg file full - of zeroes. - - - host% - dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M - - - - - - - 8.2. Assign the file to a UML device - - Add an argument like the following to the UML command line: - - ubd4=new_filesystem - - - - - making sure that you use an unassigned ubd device number. - - - - 8.3. Creating and mounting the filesystem - - Make sure that the filesystem is available, either by being built into - the kernel, or available as a module, then boot up UML and log in. If - the root filesystem doesn't have the filesystem utilities (mkfs, fsck, - etc), then get them into UML by way of the net or hostfs. - - - Make the new filesystem on the device assigned to the new file: - - - host# mkreiserfs /dev/ubd/4 - - - <----------- MKREISERFSv2 -----------> - - ReiserFS version 3.6.25 - Block size 4096 bytes - Block count 25856 - Used blocks 8212 - Journal - 8192 blocks (18-8209), journal header is in block 8210 - Bitmaps: 17 - Root block 8211 - Hash function "r5" - ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y - journal size 8192 (from 18) - Initializing journal - 0%....20%....40%....60%....80%....100% - Syncing..done. - - - - - Now, mount it: - - - UML# - mount /dev/ubd/4 /mnt - - - - - and you're in business. - - - - - - - - - - 9. Host file access - - - If you want to access files on the host machine from inside UML, you - can treat it as a separate machine and either nfs mount directories - from the host or copy files into the virtual machine with scp or rcp. - However, since UML is running on the host, it can access those - files just like any other process and make them available inside the - virtual machine without needing to use the network. - - - This is now possible with the hostfs virtual filesystem. With it, you - can mount a host directory into the UML filesystem and access the - files contained in it just as you would on the host. - - - 9.1. Using hostfs - - To begin with, make sure that hostfs is available inside the virtual - machine with - - - UML# cat /proc/filesystems - - - - . hostfs should be listed. If it's not, either rebuild the kernel - with hostfs configured into it or make sure that hostfs is built as a - module and available inside the virtual machine, and insmod it. - - - Now all you need to do is run mount: - - - UML# mount none /mnt/host -t hostfs - - - - - will mount the host's / on the virtual machine's /mnt/host. - - - If you don't want to mount the host root directory, then you can - specify a subdirectory to mount with the -o switch to mount: - - - UML# mount none /mnt/home -t hostfs -o /home - - - - - will mount the hosts's /home on the virtual machine's /mnt/home. - - - - 9.2. hostfs as the root filesystem - - It's possible to boot from a directory hierarchy on the host using - hostfs rather than using the standard filesystem in a file. - - To start, you need that hierarchy. The easiest way is to loop mount - an existing root_fs file: - - - host# mount root_fs uml_root_dir -o loop - - - - - You need to change the filesystem type of / in etc/fstab to be - 'hostfs', so that line looks like this: - - /dev/ubd/0 / hostfs defaults 1 1 - - - - - Then you need to chown to yourself all the files in that directory - that are owned by root. This worked for me: - - - host# find . -uid 0 -exec chown jdike {} \; - - - - - Next, make sure that your UML kernel has hostfs compiled in, not as a - module. Then run UML with the boot device pointing at that directory: - - - ubd0=/path/to/uml/root/directory - - - - - UML should then boot as it does normally. - - - 9.3. Building hostfs - - If you need to build hostfs because it's not in your kernel, you have - two choices: - - - - o Compiling hostfs into the kernel: - - - Reconfigure the kernel and set the 'Host filesystem' option under - - - o Compiling hostfs as a module: - - - Reconfigure the kernel and set the 'Host filesystem' option under - be in arch/um/fs/hostfs/hostfs.o. Install that in - /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and - - - UML# insmod hostfs - - - - - - - - - - - - - 10. The Management Console - - - - The UML management console is a low-level interface to the kernel, - somewhat like the i386 SysRq interface. Since there is a full-blown - operating system under UML, there is much greater flexibility possible - than with the SysRq mechanism. - - - There are a number of things you can do with the mconsole interface: - - o get the kernel version - - o add and remove devices - - o halt or reboot the machine - - o Send SysRq commands - - o Pause and resume the UML - - - You need the mconsole client (uml_mconsole) which is present in CVS - (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in - 2.4.6. - - - You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. - When you boot UML, you'll see a line like: - - - mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole - - - - - If you specify a unique machine id one the UML command line, i.e. - - - umid=debian - - - - - you'll see this - - - mconsole initialized on /home/jdike/.uml/debian/mconsole - - - - - That file is the socket that uml_mconsole will use to communicate with - UML. Run it with either the umid or the full path as its argument: - - - host% uml_mconsole debian - - - - - or - - - host% uml_mconsole /home/jdike/.uml/debian/mconsole - - - - - You'll get a prompt, at which you can run one of these commands: - - o version - - o halt - - o reboot - - o config - - o remove - - o sysrq - - o help - - o cad - - o stop - - o go - - - 10.1. version - - This takes no arguments. It prints the UML version. - - - (mconsole) version - OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 - - - - - There are a couple actual uses for this. It's a simple no-op which - can be used to check that a UML is running. It's also a way of - sending an interrupt to the UML. This is sometimes useful on SMP - hosts, where there's a bug which causes signals to UML to be lost, - often causing it to appear to hang. Sending such a UML the mconsole - version command is a good way to 'wake it up' before networking has - been enabled, as it does not do anything to the function of the UML. - - - - 10.2. halt and reboot - - These take no arguments. They shut the machine down immediately, with - no syncing of disks and no clean shutdown of userspace. So, they are - pretty close to crashing the machine. - - - (mconsole) halt - OK - - - - - - - 10.3. config - - "config" adds a new device to the virtual machine. Currently the ubd - and network drivers support this. It takes one argument, which is the - device to add, with the same syntax as the kernel command line. - - - - - (mconsole) - config ubd3=/home/jdike/incoming/roots/root_fs_debian22 - - OK - (mconsole) config eth1=mcast - OK - - - - - - - 10.4. remove - - "remove" deletes a device from the system. Its argument is just the - name of the device to be removed. The device must be idle in whatever - sense the driver considers necessary. In the case of the ubd driver, - the removed block device must not be mounted, swapped on, or otherwise - open, and in the case of the network driver, the device must be down. - - - (mconsole) remove ubd3 - OK - (mconsole) remove eth1 - OK - - - - - - - 10.5. sysrq - - This takes one argument, which is a single letter. It calls the - generic kernel's SysRq driver, which does whatever is called for by - that argument. See the SysRq documentation in - Documentation/admin-guide/sysrq.rst in your favorite kernel tree to - see what letters are valid and what they do. - - - - 10.6. help - - "help" returns a string listing the valid commands and what each one - does. - - - - 10.7. cad - - This invokes the Ctl-Alt-Del action on init. What exactly this ends - up doing is up to /etc/inittab. Normally, it reboots the machine. - With UML, this is usually not desired, so if a halt would be better, - then find the section of inittab that looks like this - - - # What to do when CTRL-ALT-DEL is pressed. - ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now - - - - - and change the command to halt. - - - - 10.8. stop - - This puts the UML in a loop reading mconsole requests until a 'go' - mconsole command is received. This is very useful for making backups - of UML filesystems, as the UML can be stopped, then synced via 'sysrq - s', so that everything is written to the filesystem. You can then copy - the filesystem and then send the UML 'go' via mconsole. - - - Note that a UML running with more than one CPU will have problems - after you send the 'stop' command, as only one CPU will be held in a - mconsole loop and all others will continue as normal. This is a bug, - and will be fixed. - - - - 10.9. go - - This resumes a UML after being paused by a 'stop' command. Note that - when the UML has resumed, TCP connections may have timed out and if - the UML is paused for a long period of time, crond might go a little - crazy, running all the jobs it didn't do earlier. - - - - - - - - - 11. Kernel debugging - - - Note: The interface that makes debugging, as described here, possible - is present in 2.4.0-test6 kernels and later. - - - Since the user-mode kernel runs as a normal Linux process, it is - possible to debug it with gdb almost like any other process. It is - slightly different because the kernel's threads are already being - ptraced for system call interception, so gdb can't ptrace them. - However, a mechanism has been added to work around that problem. - - - In order to debug the kernel, you need build it from source. See - ``Compiling the kernel and modules'' for information on doing that. - Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during - the config. These will compile the kernel with -g, and enable the - ptrace proxy so that gdb works with UML, respectively. - - - - - 11.1. Starting the kernel under gdb - - You can have the kernel running under the control of gdb from the - beginning by putting 'debug' on the command line. You will get an - xterm with gdb running inside it. The kernel will send some commands - to gdb which will leave it stopped at the beginning of start_kernel. - At this point, you can get things going with 'next', 'step', or - 'cont'. - - - There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an - interrupt handler. - 11.2. Examining sleeping processes - - Not every bug is evident in the currently running process. Sometimes, - processes hang in the kernel when they shouldn't because they've - deadlocked on a semaphore or something similar. In this case, when - you ^C gdb and get a backtrace, you will see the idle thread, which - isn't very relevant. - - - What you want is the stack of whatever process is sleeping when it - shouldn't be. You need to figure out which process that is, which is - generally fairly easy. Then you need to get its host process id, - which you can do either by looking at ps on the host or at - task.thread.extern_pid in gdb. - - - Now what you do is this: - - o detach from the current thread - - - (UML gdb) det - - - - - - o attach to the thread you are interested in - - - (UML gdb) att - - - - - - o look at its stack and anything else of interest - - - (UML gdb) bt - - - - - Note that you can't do anything at this point that requires that a - process execute, e.g. calling a function - - o when you're done looking at that process, reattach to the current - thread and continue it - - - (UML gdb) - att 1 - - - - - - - (UML gdb) - c - - - - - Here, specifying any pid which is not the process id of a UML thread - will cause gdb to reattach to the current thread. I commonly use 1, - but any other invalid pid would work. - - - - 11.3. Running ddd on UML - - ddd works on UML, but requires a special kludge. The process goes - like this: - - o Start ddd - - - host% ddd linux - - - - - - o With ps, get the pid of the gdb that ddd started. You can ask the - gdb to tell you, but for some reason that confuses things and - causes a hang. - - o run UML with 'debug=parent gdb-pid=' added to the command line - - it will just sit there after you hit return - - o type 'att 1' to the ddd gdb and you will see something like - - - 0xa013dc51 in __kill () - - - (gdb) - - - - - - o At this point, type 'c', UML will boot up, and you can use ddd just - as you do on any other process. - - - - 11.4. Debugging modules - - gdb has support for debugging code which is dynamically loaded into - the process. This support is what is needed to debug kernel modules - under UML. - - - Using that support is somewhat complicated. You have to tell gdb what - object file you just loaded into UML and where in memory it is. Then, - it can read the symbol table, and figure out where all the symbols are - from the load address that you provided. It gets more interesting - when you load the module again (i.e. after an rmmod). You have to - tell gdb to forget about all its symbols, including the main UML ones - for some reason, then load then all back in again. - - - There's an easy way and a hard way to do this. The easy way is to use - the umlgdb expect script written by Chandan Kudige. It basically - automates the process for you. - - - First, you must tell it where your modules are. There is a list in - the script that looks like this: - set MODULE_PATHS { - "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" - "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" - "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" - } - - - - - You change that to list the names and paths of the modules that you - are going to debug. Then you run it from the toplevel directory of - your UML pool and it basically tells you what to do: - - - - - ******** GDB pid is 21903 ******** - Start UML as: ./linux debug gdb-pid=21903 - - - - GNU gdb 5.0rh-5 Red Hat Linux 7.1 - Copyright 2001 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) b sys_init_module - Breakpoint 1 at 0xa0011923: file module.c, line 349. - (gdb) att 1 - - - - - After you run UML and it sits there doing nothing, you hit return at - the 'att 1' and continue it: - - - Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 - 0xa00f4221 in __kill () - (UML gdb) c - Continuing. - - - - - At this point, you debug normally. When you insmod something, the - expect magic will kick in and you'll see something like: - - - - - - - - - - - - - - - - - - *** Module hostfs loaded *** - Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 349 char *name, *n_name, *name_tmp = NULL; - (UML gdb) finish - Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 - 411 else res = EXECUTE_SYSCALL(syscall, regs); - Value returned is $1 = 0 - (UML gdb) - p/x (int)module_list + module_list->size_of_struct - - $2 = 0xa9021054 - (UML gdb) symbol-file ./linux - Load new symbol table from "./linux"? (y or n) y - Reading symbols from ./linux... - done. - (UML gdb) - add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 - - add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at - .text_addr = 0xa9021054 - (y or n) y - - Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... - done. - (UML gdb) p *module_list - $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", - size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, - nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, - init = 0xa90221f0 , cleanup = 0xa902222c , - ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, - persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, - kallsyms_end = 0x0, - archdata_start = 0x1b855
, - archdata_end = 0xe5890000
, - kernel_data = 0xf689c35d
} - >> Finished loading symbols for hostfs ... - - - - - That's the easy way. It's highly recommended. The hard way is - described below in case you're interested in what's going on. - - - Boot the kernel under the debugger and load the module with insmod or - modprobe. With gdb, do: - - - (UML gdb) p module_list - - - - - This is a list of modules that have been loaded into the kernel, with - the most recently loaded module first. Normally, the module you want - is at module_list. If it's not, walk down the next links, looking at - the name fields until find the module you want to debug. Take the - address of that structure, and add module.size_of_struct (which in - 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition - for you :-): - - - - (UML gdb) - printf "%#x\n", (int)module_list module_list->size_of_struct - - - - - The offset from the module start occasionally changes (before 2.4.0, - it was module.size_of_struct + 4), so it's a good idea to check the - init and cleanup addresses once in a while, as describe below. Now - do: - - - (UML gdb) - add-symbol-file /path/to/module/on/host that_address - - - - - Tell gdb you really want to do it, and you're in business. - - - If there's any doubt that you got the offset right, like breakpoints - appear not to work, or they're appearing in the wrong place, you can - check it by looking at the module structure. The init and cleanup - fields should look like: - - - init = 0x588066b0 , cleanup = 0x588066c0 - - - - - with no offsets on the symbol names. If the names are right, but they - are offset, then the offset tells you how much you need to add to the - address you gave to add-symbol-file. - - - When you want to load in a new version of the module, you need to get - gdb to forget about the old one. The only way I've found to do that - is to tell gdb to forget about all symbols that it knows about: - - - (UML gdb) symbol-file - - - - - Then reload the symbols from the kernel binary: - - - (UML gdb) symbol-file /path/to/kernel - - - - - and repeat the process above. You'll also need to re-enable break- - points. They were disabled when you dumped all the symbols because - gdb couldn't figure out where they should go. - - - - 11.5. Attaching gdb to the kernel - - If you don't have the kernel running under gdb, you can attach gdb to - it later by sending the tracing thread a SIGUSR1. The first line of - the console output identifies its pid: - tracing thread pid = 20093 - - - - - When you send it the signal: - - - host% kill -USR1 20093 - - - - - you will get an xterm with gdb running in it. - - - If you have the mconsole compiled into UML, then the mconsole client - can be used to start gdb: - - - (mconsole) (mconsole) config gdb=xterm - - - - - will fire up an xterm with gdb running in it. - - - - 11.6. Using alternate debuggers - - UML has support for attaching to an already running debugger rather - than starting gdb itself. This is present in CVS as of 17 Apr 2001. - I sent it to Alan for inclusion in the ac tree, and it will be in my - 2.4.4 release. - - - This is useful when gdb is a subprocess of some UI, such as emacs or - ddd. It can also be used to run debuggers other than gdb on UML. - Below is an example of using strace as an alternate debugger. - - - To do this, you need to get the pid of the debugger and pass it in - with the - - - If you are using gdb under some UI, then tell it to 'att 1', and - you'll find yourself attached to UML. - - - If you are using something other than gdb as your debugger, then - you'll need to get it to do the equivalent of 'att 1' if it doesn't do - it automatically. - - - An example of an alternate debugger is strace. You can strace the - actual kernel as follows: - - o Run the following in a shell - - - host% - sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' - - - - o Run UML with 'debug' and 'gdb-pid=' with the pid printed out - by the previous command - - o Hit return in the shell, and UML will start running, and strace - output will start accumulating in the output file. - - Note that this is different from running - - - host% strace ./linux - - - - - That will strace only the main UML thread, the tracing thread, which - doesn't do any of the actual kernel work. It just oversees the vir- - tual machine. In contrast, using strace as described above will show - you the low-level activity of the virtual machine. - - - - - - 12. Kernel debugging examples - - 12.1. The case of the hung fsck - - When booting up the kernel, fsck failed, and dropped me into a shell - to fix things up. I ran fsck -y, which hung: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 was not cleanly unmounted, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Inode 19780, i_blocks is 1548, should be 540. Fix? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - - - - - The standard drill in this sort of situation is to fire up gdb on the - signal thread, which, in this case, was pid 1935. In another window, - I run gdb and attach pid 1935. - - - - - ~/linux/2.3.26/um 1016: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - - (gdb) att 1935 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 - 0x100756d9 in __wait4 () - - - - - - - Let's see what's currently running: - - - - (gdb) p current_task.pid - $1 = 0 - - - - - - It's the idle thread, which means that fsck went to sleep for some - reason and never woke up. - - - Let's guess that the last process in the process list is fsck: - - - - (gdb) p current_task.prev_task.comm - $13 = "fsck.ext2\000\000\000\000\000\000" - - - - - - It is, so let's see what it thinks it's up to: - - - - (gdb) p current_task.prev_task.thread - $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, - kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { - 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, - request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { - regs = {1350467584, 2952789424, 0 }, sigstack = 0, - pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, - arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { - op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} - - - - - - The interesting things here are the fact that its .thread.syscall.id - is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or - the defines in include/asm-um/arch/unistd.h), and that it never - returned. Also, its .request.op is OP_SWITCH (see - arch/um/include/user_util.h). These mean that it went into a write, - and, for some reason, called schedule(). - - - The fact that it never returned from write means that its stack should - be fairly interesting. Its pid is 1980 (.thread.extern_pid). That - process is being ptraced by the signal thread, so it must be detached - before gdb can attach it: - - - - - - - - - - - (gdb) call detach(1980) - - Program received signal SIGSEGV, Segmentation fault. - - The program being debugged stopped while in a function called from GDB. - When the function (detach) is done executing, GDB will silently - stop (instead of continuing to evaluate the expression containing - the function call). - (gdb) call detach(1980) - $15 = 0 - - - - - - The first detach segfaults for some reason, and the second one - succeeds. - - - Now I detach from the signal thread, attach to the fsck thread, and - look at its stack: - - - (gdb) det - Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 - (gdb) att 1980 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 - 0x10070451 in __kill () - (gdb) bt - #0 0x10070451 in __kill () - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - #4 0x10001d12 in schedule () at core.c:777 - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #9 - #10 0x10155404 in errno () - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #14 - #15 0xc0fd in ?? () - #16 0x10016647 in sys_write (fd=3, - buf=0x80b8800
, count=1024) - at read_write.c:159 - #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #19 - #20 0x400dc8b0 in ?? () - - - - - - The interesting things here are : - - o There are two segfaults on this stack (frames 9 and 14) - - o The first faulting address (frame 11) is 0x50000800 - - (gdb) p (void *)1342179328 - $16 = (void *) 0x50000800 - - - - - - The initial faulting address is interesting because it is on the idle - thread's stack. I had been seeing the idle thread segfault for no - apparent reason, and the cause looked like stack corruption. In hopes - of catching the culprit in the act, I had turned off all protections - to that stack while the idle thread wasn't running. This apparently - tripped that trap. - - - However, the more immediate problem is that second segfault and I'm - going to concentrate on that. First, I want to see where the fault - happened, so I have to go look at the sigcontent struct in frame 8: - - - - (gdb) up - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - 30 kill(pid, SIGUSR1); - (gdb) - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - 156 usr1_pid(getpid()); - (gdb) - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - 161 _switch_to(prev, next); - (gdb) - #4 0x10001d12 in schedule () at core.c:777 - 777 switch_to(prev, next, prev); - (gdb) - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - 71 schedule(); - (gdb) - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - 157 } - (gdb) - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - 182 segv_handler(sc); - (gdb) p *sc - Cannot access memory at address 0x0. - - - - - That's not very useful, so I'll try a more manual method: - - - (gdb) p *((struct sigcontext *) (&sig + 1)) - $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, - esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, - trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, - esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1280} - - - - The ip is in handle_mm_fault: - - - (gdb) p (void *)268480945 - $20 = (void *) 0x1000b1b1 - (gdb) i sym $20 - handle_mm_fault + 57 in section .text - - - - - - Specifically, it's in pte_alloc: - - - (gdb) i line *$20 - Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b1 - and ends at 0x1000b1b7 . - - - - - - To find where in handle_mm_fault this is, I'll jump forward in the - code until I see an address in that procedure: - - - - (gdb) i line *0x1000b1c0 - Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b7 - and ends at 0x1000b1c3 . - (gdb) i line *0x1000b1d0 - Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1d0 - and ends at 0x1000b1da . - (gdb) i line *0x1000b1e0 - Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1da - and ends at 0x1000b1e1 . - (gdb) i line *0x1000b1f0 - Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1f0 - and ends at 0x1000b200 . - (gdb) i line *0x1000b200 - Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b200 - and ends at 0x1000b208 . - (gdb) i line *0x1000b210 - Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b210 - and ends at 0x1000b219 . - (gdb) i line *0x1000b220 - Line 1168 of "memory.c" starts at address 0x1000b21e - and ends at 0x1000b222 . - - - - - - Something is apparently wrong with the page tables or vma_structs, so - lets go back to frame 11 and have a look at them: - - - - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - 50 handle_mm_fault(current, vma, address, is_write); - (gdb) call pgd_offset_proc(vma->vm_mm, address) - $22 = (pgd_t *) 0x80a548c - - - - - - That's pretty bogus. Page tables aren't supposed to be in process - text or data areas. Let's see what's in the vma: - - - (gdb) p *vma - $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, - vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, - vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, - vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, - vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, - vm_private_data = 0x62} - (gdb) p *vma.vm_mm - $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, - pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, - map_count = 134909076, mmap_sem = {count = {counter = 135073792}, - sleepers = -1342177872, wait = {lock = , - task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, - __magic = -1342177670, __creator = -1342177300}, __magic = 98}, - page_table_lock = {}, context = 138, start_code = 0, end_code = 0, - start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, - arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, - total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, - swap_address = 0, segments = 0x0} - - - - - - This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx - addresses, this is looking like a stack was plonked down on top of - these structures. Maybe it's a stack overflow from the next page: - - - - (gdb) p vma - $25 = (struct vm_area_struct *) 0x507d2434 - - - - - - That's towards the lower quarter of the page, so that would have to - have been pretty heavy stack overflow: - - - - - - - - - - - - - - - (gdb) x/100x $25 - 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c - 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 - 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a - 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 - 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 - 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 - 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 - - - - - - It's not stack overflow. The only "stack-like" piece of this data is - the vma_struct itself. - - - At this point, I don't see any avenues to pursue, so I just have to - admit that I have no idea what's going on. What I will do, though, is - stick a trap on the segfault handler which will stop if it sees any - writes to the idle thread's stack. That was the thing that happened - first, and it may be that if I can catch it immediately, what's going - on will be somewhat clearer. - - - 12.2. Episode 2: The case of the hung fsck - - After setting a trap in the SEGV handler for accesses to the signal - thread's stack, I reran the kernel. - - - fsck hung again, this time by hitting the trap: - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 contains a file system with errors, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - Untested (4127) [100fe44c]: trap_kern.c line 31 - - - - - - I need to get the signal thread to detach from pid 4127 so that I can - attach to it with gdb. This is done by sending it a SIGUSR1, which is - caught by the signal thread, which detaches the process: - - - kill -USR1 4127 - - - - - - Now I can run gdb on it: - - - - - - - - - - - - - - ~/linux/2.3.26/um 1034: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) att 4127 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 - 0x10075891 in __libc_nanosleep () - - - - - - The backtrace shows that it was in a write and that the fault address - (address in frame 3) is 0x50000800, which is right in the middle of - the signal thread's stack page: - - - (gdb) bt - #0 0x10075891 in __libc_nanosleep () - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - #2 0x1006ce9a in stop () at user_util.c:191 - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 - #6 - #7 0xc0fd in ?? () - #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) - at read_write.c:159 - #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #11 - #12 0x400dc8b0 in ?? () - #13 - #14 0x400dc8b0 in ?? () - #15 0x80545fd in ?? () - #16 0x804daae in ?? () - #17 0x8054334 in ?? () - #18 0x804d23e in ?? () - #19 0x8049632 in ?? () - #20 0x80491d2 in ?? () - #21 0x80596b5 in ?? () - (gdb) p (void *)1342179328 - $3 = (void *) 0x50000800 - - - - - - Going up the stack to the segv_handler frame and looking at where in - the code the access happened shows that it happened near line 110 of - block_dev.c: - - - - - - - - - - (gdb) up - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. - (gdb) - #2 0x1006ce9a in stop () at user_util.c:191 - 191 while(1) sleep(1000000); - (gdb) - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - 31 KERN_UNTESTED(); - (gdb) - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) p *sc - $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, - esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, - err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, - esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1342179328} - (gdb) p (void *)268550834 - $2 = (void *) 0x1001c2b2 - (gdb) i sym $2 - block_write + 1090 in section .text - (gdb) i line *$2 - Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" - starts at address 0x1001c2a1 - and ends at 0x1001c2bf . - (gdb) i line *0x1001c2c0 - Line 110 of "block_dev.c" starts at address 0x1001c2bf - and ends at 0x1001c2e3 . - - - - - - Looking at the source shows that the fault happened during a call to - copy_from_user to copy the data into the kernel: - - - 107 count -= chars; - 108 copy_from_user(p,buf,chars); - 109 p += chars; - 110 buf += chars; - - - - - - p is the pointer which must contain 0x50000800, since buf contains - 0x80b8800 (frame 8 above). It is defined as: - - - p = offset + bh->b_data; - - - - - - I need to figure out what bh is, and it just so happens that bh is - passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a - few lines later, so I do a little disassembly: - - - - - (gdb) disas 0x1001c2bf 0x1001c2e0 - Dump of assembler code from 0x1001c2bf to 0x1001c2d0: - 0x1001c2bf : addl %eax,0xc(%ebp) - 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx - 0x1001c2c8 : btsl $0x0,0x18(%edx) - 0x1001c2cd : btsl $0x1,0x18(%edx) - 0x1001c2d2 : sbbl %ecx,%ecx - 0x1001c2d4 : testl %ecx,%ecx - 0x1001c2d6 : jne 0x1001c2e3 - 0x1001c2d8 : pushl $0x0 - 0x1001c2da : pushl %edx - 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> - End of assembler dump. - - - - - - At that point, bh is in %edx (address 0x1001c2da), which is calculated - at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, - taking %ebp from the sigcontext_struct above: - - - (gdb) p (void *)1342631484 - $5 = (void *) 0x5006ee3c - (gdb) p 0x5006ee3c+0xfffffdd4 - $6 = 1342630928 - (gdb) p (void *)$6 - $7 = (void *) 0x5006ec10 - (gdb) p *((void **)$7) - $8 = (void *) 0x50100200 - - - - - - Now, I look at the structure to see what's in it, and particularly, - what its b_data field contains: - - - (gdb) p *((struct buffer_head *)0x50100200) - $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, - b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, - b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, - b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, - b_data = 0x50000800 "", b_page = 0x50004000, - b_end_io = 0x10017f60 , b_dev_id = 0x0, - b_rsector = 98810, b_wait = {lock = , - task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, - __creator = 0}, b_kiobuf = 0x0} - - - - - - The b_data field is indeed 0x50000800, so the question becomes how - that happened. The rest of the structure looks fine, so this probably - is not a case of data corruption. It happened on purpose somehow. - - - The b_page field is a pointer to the page_struct representing the - 0x50000000 page. Looking at it shows the kernel's idea of the state - of that page: - - - - (gdb) p *$13.b_page - $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, - index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { - next = 0x50008460, prev = 0x50019350}, wait = { - lock = , task_list = {next = 0x50004024, - prev = 0x50004024}, __magic = 1342193708, __creator = 0}, - pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, - zone = 0x100c5160} - - - - - - Some sanity-checking: the virtual field shows the "virtual" address of - this page, which in this kernel is the same as its "physical" address, - and the page_struct itself should be mem_map[0], since it represents - the first page of memory: - - - - (gdb) p (void *)1342177280 - $18 = (void *) 0x50000000 - (gdb) p mem_map - $19 = (mem_map_t *) 0x50004000 - - - - - - These check out fine. - - - Now to check out the page_struct itself. In particular, the flags - field shows whether the page is considered free or not: - - - (gdb) p (void *)132 - $21 = (void *) 0x84 - - - - - - The "reserved" bit is the high bit, which is definitely not set, so - the kernel considers the signal stack page to be free and available to - be used. - - - At this point, I jump to conclusions and start looking at my early - boot code, because that's where that page is supposed to be reserved. - - - In my setup_arch procedure, I have the following code which looks just - fine: - - - - bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); - free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); - - - - - - Two stack pages have already been allocated, and low_physmem points to - the third page, which is the beginning of free memory. - The init_bootmem call declares the entire memory to the boot memory - manager, which marks it all reserved. The free_bootmem call frees up - all of it, except for the first two pages. This looks correct to me. - - - So, I decide to see init_bootmem run and make sure that it is marking - those first two pages as reserved. I never get that far. - - - Stepping into init_bootmem, and looking at bootmem_map before looking - at what it contains shows the following: - - - - (gdb) p bootmem_map - $3 = (void *) 0x50000000 - - - - - - Aha! The light dawns. That first page is doing double duty as a - stack and as the boot memory map. The last thing that the boot memory - manager does is to free the pages used by its memory map, so this page - is getting freed even its marked as reserved. - - - The fix was to initialize the boot memory manager before allocating - those two stack pages, and then allocate them through the boot memory - manager. After doing this, and fixing a couple of subsequent buglets, - the stack corruption problem disappeared. - - - - - - 13. What to do when UML doesn't work - - - - - 13.1. Strange compilation errors when you build from source - - As of test11, it is necessary to have "ARCH=um" in the environment or - on the make command line for all steps in building UML, including - clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, - and linux. If you forget for any of them, the i386 build seems to - contaminate the UML build. If this happens, start from scratch with - - - host% - make mrproper ARCH=um - - - - - and repeat the build process with ARCH=um on all the steps. - - - See ``Compiling the kernel and modules'' for more details. - - - Another cause of strange compilation errors is building UML in - /usr/src/linux. If you do this, the first thing you need to do is - clean up the mess you made. The /usr/src/linux/asm link will now - point to /usr/src/linux/asm-um. Make it point back to - /usr/src/linux/asm-i386. Then, move your UML pool someplace else and - build it there. Also see below, where a more specific set of symptoms - is described. - - - - 13.3. A variety of panics and hangs with /tmp on a reiserfs filesys- - tem - - I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. - Panics preceded by - - - Detaching pid nnnn - - - - are diagnostic of this problem. This is a reiserfs bug which causes a - thread to occasionally read stale data from a mmapped page shared with - another thread. The fix is to upgrade the filesystem or to have /tmp - be an ext2 filesystem. - - - - 13.4. The compile fails with errors about conflicting types for - 'open', 'dup', and 'waitpid' - - This happens when you build in /usr/src/linux. The UML build makes - the include/asm link point to include/asm-um. /usr/include/asm points - to /usr/src/linux/include/asm, so when that link gets moved, files - which need to include the asm-i386 versions of headers get the - incompatible asm-um versions. The fix is to move the include/asm link - back to include/asm-i386 and to do UML builds someplace else. - - - - 13.5. UML doesn't work when /tmp is an NFS filesystem - - This seems to be a similar situation with the ReiserFS problem above. - Some versions of NFS seems not to handle mmap correctly, which UML - depends on. The workaround is have /tmp be a non-NFS directory. - - - 13.6. UML hangs on boot when compiled with gprof support - - If you build UML with gprof support and, early in the boot, it does - this - - - kernel BUG at page_alloc.c:100! - - - - - you have a buggy gcc. You can work around the problem by removing - UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up - another bug, but that one is fairly hard to reproduce. - - - - 13.7. syslogd dies with a SIGTERM on startup - - The exact boot error depends on the distribution that you're booting, - but Debian produces this: - - - /etc/rc2.d/S10sysklogd: line 49: 93 Terminated - start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD - - - - - This is a syslogd bug. There's a race between a parent process - installing a signal handler and its child sending the signal. See - this uml-devel post for the details. - - - - 13.8. TUN/TAP networking doesn't work on a 2.4 host - - There are a couple of problems which were - name="pointed - out"> by Tim Robinson - - o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. - The fix is to upgrade to something more recent and then read the - next item. - - o If you see - - - File descriptor in bad state - - - - when you bring up the device inside UML, you have a header mismatch - between the original kernel and the upgraded one. Make /usr/src/linux - point at the new headers. This will only be a problem if you build - uml_net yourself. - - - - 13.9. You can network to the host but not to other machines on the - net - - If you can connect to the host, and the host can connect to UML, but - you cannot connect to any other machines, then you may need to enable - IP Masquerading on the host. Usually this is only experienced when - using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML - networking, rather than the public address space that your host is - connected to. UML does not enable IP Masquerading, so you will need - to create a static rule to enable it: - - - host% - iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE - - - - - Replace eth0 with the interface that you use to talk to the rest of - the world. - - - Documentation on IP Masquerading, and SNAT, can be found at - www.netfilter.org . - - - If you can reach the local net, but not the outside Internet, then - that is usually a routing problem. The UML needs a default route: - - - UML# - route add default gw gateway IP - - - - - The gateway IP can be any machine on the local net that knows how to - reach the outside world. Usually, this is the host or the local net- - work's gateway. - - - Occasionally, we hear from someone who can reach some machines, but - not others on the same net, or who can reach some ports on other - machines, but not others. These are usually caused by strange - firewalling somewhere between the UML and the other box. You track - this down by running tcpdump on every interface the packets travel - over and see where they disappear. When you find a machine that takes - the packets in, but does not send them onward, that's the culprit. - - - - 13.10. I have no root and I want to scream - - Thanks to Birgit Wahlich for telling me about this strange one. It - turns out that there's a limit of six environment variables on the - kernel command line. When that limit is reached or exceeded, argument - processing stops, which means that the 'root=' argument that UML - usually adds is not seen. So, the filesystem has no idea what the - root device is, so it panics. - - - The fix is to put less stuff on the command line. Glomming all your - setup variables into one is probably the best way to go. - - - - 13.11. UML build conflict between ptrace.h and ucontext.h - - On some older systems, /usr/include/asm/ptrace.h and - /usr/include/sys/ucontext.h define the same names. So, when they're - included together, the defines from one completely mess up the parsing - of the other, producing errors like: - /usr/include/sys/ucontext.h:47: parse error before - `10' - - - - - plus a pile of warnings. - - - This is a libc botch, which has since been fixed, and I don't see any - way around it besides upgrading. - - - - 13.12. The UML BogoMips is exactly half the host's BogoMips - - On i386 kernels, there are two ways of running the loop that is used - to calculate the BogoMips rating, using the TSC if it's there or using - a one-instruction loop. The TSC produces twice the BogoMips as the - loop. UML uses the loop, since it has nothing resembling a TSC, and - will get almost exactly the same BogoMips as a host using the loop. - However, on a host with a TSC, its BogoMips will be double the loop - BogoMips, and therefore double the UML BogoMips. - - - - 13.13. When you run UML, it immediately segfaults - - If the host is configured with the 2G/2G address space split, that's - why. See ``UML on 2G/2G hosts'' for the details on getting UML to - run on your host. - - - - 13.14. xterms appear, then immediately disappear - - If you're running an up to date kernel with an old release of - uml_utilities, the port-helper program will not work properly, so - xterms will exit straight after they appear. The solution is to - upgrade to the latest release of uml_utilities. Usually this problem - occurs when you have installed a packaged release of UML then compiled - your own development kernel without upgrading the uml_utilities from - the source distribution. - - - - 13.15. Any other panic, hang, or strange behavior - - If you're seeing truly strange behavior, such as hangs or panics that - happen in random places, or you try running the debugger to see what's - happening and it acts strangely, then it could be a problem in the - host kernel. If you're not running a stock Linus or -ac kernel, then - try that. An early version of the preemption patch and a 2.4.10 SuSE - kernel have caused very strange problems in UML. - - - Otherwise, let me know about it. Send a message to one of the UML - mailing lists - either the developer list - user-mode-linux-devel at - lists dot sourceforge dot net (subscription info) or the user list - - user-mode-linux-user at lists dot sourceforge do net (subscription - info), whichever you prefer. Don't assume that everyone knows about - it and that a fix is imminent. - - - If you want to be super-helpful, read ``Diagnosing Problems'' and - follow the instructions contained therein. - 14. Diagnosing Problems - - - If you get UML to crash, hang, or otherwise misbehave, you should - report this on one of the project mailing lists, either the developer - list - user-mode-linux-devel at lists dot sourceforge dot net - (subscription info) or the user list - user-mode-linux-user at lists - dot sourceforge dot net (subscription info). When you do, it is - likely that I will want more information. So, it would be helpful to - read the stuff below, do whatever is applicable in your case, and - report the results to the list. - - - For any diagnosis, you're going to need to build a debugging kernel. - The binaries from this site aren't debuggable. If you haven't done - this before, read about ``Compiling the kernel and modules'' and - ``Kernel debugging'' UML first. - - - 14.1. Case 1 : Normal kernel panics - - The most common case is for a normal thread to panic. To debug this, - you will need to run it under the debugger (add 'debug' to the command - line). An xterm will start up with gdb running inside it. Continue - it when it stops in start_kernel and make it crash. Now ^C gdb and - - - If the panic was a "Kernel mode fault", then there will be a segv - frame on the stack and I'm going to want some more information. The - stack might look something like this: - - - (UML gdb) backtrace - #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) - at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 - #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 - #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 - #3 0x1009bf38 in __restore () - at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 - #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) - at trap_kern.c:66 - #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 - #6 0x1009bf38 in __restore () - - - - - I'm going to want to see the symbol and line information for the value - of ip in the segv frame. In this case, you would do the following: - - - (UML gdb) i sym 268849158 - - - - - and - - - (UML gdb) i line *268849158 - - - - - The reason for this is the __restore frame right above the segv_han- - dler frame is hiding the frame that actually segfaulted. So, I have - to get that information from the faulting ip. - - - 14.2. Case 2 : Tracing thread panics - - The less common and more painful case is when the tracing thread - panics. In this case, the kernel debugger will be useless because it - needs a healthy tracing thread in order to work. The first thing to - do is get a backtrace from the tracing thread. This is done by - figuring out what its pid is, firing up gdb, and attaching it to that - pid. You can figure out the tracing thread pid by looking at the - first line of the console output, which will look like this: - - - tracing thread pid = 15851 - - - - - or by running ps on the host and finding the line that looks like - this: - - - jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] - - - - - If the panic was 'segfault in signals', then follow the instructions - above for collecting information about the location of the seg fault. - - - If the tracing thread flaked out all by itself, then send that - backtrace in and wait for our crack debugging team to fix the problem. - - - 14.3. Case 3 : Tracing thread panics caused by other threads - - However, there are cases where the misbehavior of another thread - caused the problem. The most common panic of this type is: - - - wait_for_stop failed to wait for to stop with - - - - - In this case, you'll need to get a backtrace from the process men- - tioned in the panic, which is complicated by the fact that the kernel - debugger is defunct and without some fancy footwork, another gdb can't - attach to it. So, this is how the fancy footwork goes: - - In a shell: - - - host% kill -STOP pid - - - - - Run gdb on the tracing thread as described in case 2 and do: - - - (host gdb) call detach(pid) - - - If you get a segfault, do it again. It always works the second time. - - Detach from the tracing thread and attach to that other thread: - - - (host gdb) detach - - - - - - - (host gdb) attach pid - - - - - If gdb hangs when attaching to that process, go back to a shell and - do: - - - host% - kill -CONT pid - - - - - And then get the backtrace: - - - (host gdb) backtrace - - - - - - 14.4. Case 4 : Hangs - - Hangs seem to be fairly rare, but they sometimes happen. When a hang - happens, we need a backtrace from the offending process. Run the - kernel debugger as described in case 1 and get a backtrace. If the - current process is not the idle thread, then send in the backtrace. - You can tell that it's the idle thread if the stack looks like this: - - - #0 0x100b1401 in __libc_nanosleep () - #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 - #2 0x100a546f in do_idle () at process_kern.c:445 - #3 0x100a5508 in cpu_idle () at process_kern.c:471 - #4 0x100ec18f in start_kernel () at init/main.c:592 - #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 - #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 - - - - - If this is the case, then some other process is at fault, and went to - sleep when it shouldn't have. Run ps on the host and figure out which - process should not have gone to sleep and stayed asleep. Then attach - to it with gdb and get a backtrace as described in case 3. - - - - - - - 15. Thanks - - - A number of people have helped this project in various ways, and this - page gives recognition where recognition is due. - - - If you're listed here and you would prefer a real link on your name, - or no link at all, instead of the despammed email address pseudo-link, - let me know. - - - If you're not listed here and you think maybe you should be, please - let me know that as well. I try to get everyone, but sometimes my - bookkeeping lapses and I forget about contributions. - - - 15.1. Code and Documentation - - Rusty Russell - - - o wrote the HOWTO - - o prodded me into making this project official and putting it on - SourceForge - - o came up with the way cool UML logo - - o redid the config process - - - Peter Moulder - Fixed my config and build - processes, and added some useful code to the block driver - - - Bill Stearns - - - o HOWTO updates - - o lots of bug reports - - o lots of testing - - o dedicated a box (uml.ists.dartmouth.edu) to support UML development - - o wrote the mkrootfs script, which allows bootable filesystems of - RPM-based distributions to be cranked out - - o cranked out a large number of filesystems with said script - - - Jim Leu - Wrote the virtual ethernet driver - and associated usermode tools - - Lars Brinkhoff - Contributed the ptrace - proxy from his own project to allow easier - kernel debugging - - - Andrea Arcangeli - Redid some of the early boot - code so that it would work on machines with Large File Support - - - Chris Emerson - Did - the first UML port to Linux/ppc - - - Harald Welte - Wrote the multicast - transport for the network driver - - - Jorgen Cederlof - Added special file support to hostfs - - - Greg Lonnon - Changed the ubd driver - to allow it to layer a COW file on a shared read-only filesystem and - wrote the iomem emulation support - - - Henrik Nordstrom - Provided a variety - of patches, fixes, and clues - - - Lennert Buytenhek - Contributed various patches, a rewrite of the - network driver, the first implementation of the mconsole driver, and - did the bulk of the work needed to get SMP working again. - - - Yon Uriarte - Fixed the TUN/TAP network backend while I slept. - - - Adam Heath - Made a bunch of nice cleanups to the initialization code, - plus various other small patches. - - - Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and - is doing a real nice job of it. He also noticed and fixed a number of - actually and potentially exploitable security holes in uml_net. Plus - the occasional patch. I like patches. - - - James McMechan - James seems to have taken over maintenance of the ubd - driver and is doing a nice job of it. - - - Chandan Kudige - wrote the umlgdb script which automates the reloading - of module symbols. - - - Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, - enabling UML processes to access audio devices on the host. He also - submitted patches for the slip transport and lots of other things. - - - David Coulson - - - o Set up the usermodelinux.org site, - which is a great way of keeping the UML user community on top of - UML goings-on. - - o Site documentation and updates - - o Nifty little UML management daemon UMLd - - - o Lots of testing and bug reports - - - - - 15.2. Flushing out bugs - - - - o Yuri Pudgorodsky - - o Gerald Britton - - o Ian Wehrman - - o Gord Lamb - - o Eugene Koontz - - o John H. Hartman - - o Anders Karlsson - - o Daniel Phillips - - o John Fremlin - - o Rainer Burgstaller - - o James Stevenson - - o Matt Clay - - o Cliff Jefferies - - o Geoff Hoff - - o Lennert Buytenhek - - o Al Viro - - o Frank Klingenhoefer - - o Livio Baldini Soares - - o Jon Burgess - - o Petru Paler - - o Paul - - o Chris Reahard - - o Sverker Nilsson - - o Gong Su - - o johan verrept - - o Bjorn Eriksson - - o Lorenzo Allegrucci - - o Muli Ben-Yehuda - - o David Mansfield - - o Howard Goff - - o Mike Anderson - - o John Byrne - - o Sapan J. Batia - - o Iris Huang - - o Jan Hudec - - o Voluspa - - - - - 15.3. Buglets and clean-ups - - - - o Dave Zarzycki - - o Adam Lazur - - o Boria Feigin - - o Brian J. Murrell - - o JS - - o Roman Zippel - - o Wil Cooley - - o Ayelet Shemesh - - o Will Dyson - - o Sverker Nilsson - - o dvorak - - o v.naga srinivas - - o Shlomi Fish - - o Roger Binns - - o johan verrept - - o MrChuoi - - o Peter Cleve - - o Vincent Guffens - - o Nathan Scott - - o Patrick Caulfield - - o jbearce - - o Catalin Marinas - - o Shane Spencer - - o Zou Min - - - o Ryan Boder - - o Lorenzo Colitti - - o Gwendal Grignou - - o Andre' Breiler - - o Tsutomu Yasuda - - - - 15.4. Case Studies - - - o Jon Wright - - o William McEwan - - o Michael Richardson - - - - 15.5. Other contributions - - - Bill Carr made the Red Hat mkrootfs script - work with RH 6.2. - - Michael Jennings sent in some material which - is now gracing the top of the index page of this site. - - SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com - . The bandwidth there made it possible to - produce most of the filesystems available on the project download - page. - - Laurent Bonnaud took the old grotty - Debian filesystem that I've been distributing and updated it to 2.2. - It is now available by itself here. - - Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make - releases even when Sourceforge is broken. - - Rodrigo de Castro looked at my broken pte code and told me what was - wrong with it, letting me fix a long-standing (several weeks) and - serious set of bugs. - - Chris Reahard built a specialized root filesystem for running a DNS - server jailed inside UML. It's available from the download - page in the Jail - Filesystems section. - - - - - - - - - - - - diff --git a/MAINTAINERS b/MAINTAINERS index debbb7b97c98..1aec93695040 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8727,7 +8727,7 @@ L: kvm@vger.kernel.org W: http://www.linux-kvm.org T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git S: Supported -F: Documentation/virtual/kvm/ +F: Documentation/virt/kvm/ F: include/trace/events/kvm.h F: include/uapi/asm-generic/kvm* F: include/uapi/linux/kvm* @@ -12054,7 +12054,7 @@ M: Juergen Gross M: Alok Kataria L: virtualization@lists.linux-foundation.org S: Supported -F: Documentation/virtual/paravirt_ops.txt +F: Documentation/virt/paravirt_ops.txt F: arch/*/kernel/paravirt* F: arch/*/include/asm/paravirt*.h F: include/linux/hypervisor.h @@ -16745,7 +16745,7 @@ W: http://user-mode-linux.sourceforge.net Q: https://patchwork.ozlabs.org/project/linux-um/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/uml.git S: Maintained -F: Documentation/virtual/uml/ +F: Documentation/virt/uml/ F: arch/um/ F: arch/x86/um/ F: fs/hostfs/ diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h index 01555c6ae0f5..be48c2215fa2 100644 --- a/arch/powerpc/include/uapi/asm/kvm_para.h +++ b/arch/powerpc/include/uapi/asm/kvm_para.h @@ -31,7 +31,7 @@ * Struct fields are always 32 or 64 bit aligned, depending on them being 32 * or 64 bit wide respectively. * - * See Documentation/virtual/kvm/ppc-pv.txt + * See Documentation/virt/kvm/ppc-pv.txt */ struct kvm_vcpu_arch_shared { __u64 scratch1; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8f72526e2f68..24843cf49579 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3466,7 +3466,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See - * Documentation/virtual/kvm/locking.txt to get more detail. + * Documentation/virt/kvm/locking.txt to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a7c19540ce21..5e3f12d5359e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virtual/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.txt */ union { __u32 irq; @@ -1086,7 +1086,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virtual/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.txt. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c2152f3dd02d..e7c67be7c15f 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virtual/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.txt */ union { __u32 irq; @@ -1085,7 +1085,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virtual/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.txt. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f645c0fbf7ec..acc43242a310 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -727,7 +727,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. * See the comment in kvm_vcpu_exiting_guest_mode() and - * Documentation/virtual/kvm/vcpu-requests.rst + * Documentation/virt/kvm/vcpu-requests.rst */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 936962abc38d..c45e2d7e942f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -250,7 +250,7 @@ static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, * pending state of interrupt is latched in pending_latch variable. * Userspace will save and restore pending state and line_level * separately. - * Refer to Documentation/virtual/kvm/devices/arm-vgic-v3.txt + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt * for handling of ISPENDR and ICPENDR. */ for (i = 0; i < len * 8; i++) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 57205beaa981..3b7525deec80 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,7 +42,7 @@ VGIC_AFFINITY_LEVEL(val, 3)) /* - * As per Documentation/virtual/kvm/devices/arm-vgic-v3.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, * below macros are defined for CPUREG encoding. */ #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 @@ -63,7 +63,7 @@ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) /* - * As per Documentation/virtual/kvm/devices/arm-vgic-its.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-its.txt, * below macros are defined for ITS table entry encoding. */ #define KVM_ITS_CTE_VALID_SHIFT 63 -- cgit v1.2.3 From 6d1bcb957be2850e0776f24c289e1f87c256baeb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:43:07 +0100 Subject: iommu: Remove empty iommu_tlb_range_add() callback from iommu_ops Commit add02cfdc9bc ("iommu: Introduce Interface for IOMMU TLB Flushing") added three new TLB flushing operations to the IOMMU API so that the underlying driver operations can be batched when unmapping large regions of IO virtual address space. However, the ->iotlb_range_add() callback has not been implemented by any IOMMU drivers (amd_iommu.c implements it as an empty function, which incurs the overhead of an indirect branch). Instead, drivers either flush the entire IOTLB in the ->iotlb_sync() callback or perform the necessary invalidation during ->unmap(). Attempting to implement ->iotlb_range_add() for arm-smmu-v3.c revealed two major issues: 1. The page size used to map the region in the page-table is not known, and so it is not generally possible to issue TLB flushes in the most efficient manner. 2. The only mutable state passed to the callback is a pointer to the iommu_domain, which can be accessed concurrently and therefore requires expensive synchronisation to keep track of the outstanding flushes. Remove the callback entirely in preparation for extending ->unmap() and ->iotlb_sync() to update a token on the caller's stack. Signed-off-by: Will Deacon --- drivers/iommu/amd_iommu.c | 6 ------ drivers/iommu/iommu.c | 3 --- drivers/vfio/vfio_iommu_type1.c | 1 - include/linux/iommu.h | 15 --------------- 4 files changed, 25 deletions(-) (limited to 'include') diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b607a92791d3..f93b148cf55e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3196,11 +3196,6 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) domain_flush_complete(dom); } -static void amd_iommu_iotlb_range_add(struct iommu_domain *domain, - unsigned long iova, size_t size) -{ -} - const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, @@ -3219,7 +3214,6 @@ const struct iommu_ops amd_iommu_ops = { .is_attach_deferred = amd_iommu_is_attach_deferred, .pgsize_bitmap = AMD_IOMMU_PGSIZES, .flush_iotlb_all = amd_iommu_flush_iotlb_all, - .iotlb_range_add = amd_iommu_iotlb_range_add, .iotlb_sync = amd_iommu_flush_iotlb_all, }; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0c674d80c37f..6d7b25fe2474 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1903,9 +1903,6 @@ static size_t __iommu_unmap(struct iommu_domain *domain, if (!unmapped_page) break; - if (sync && ops->iotlb_range_add) - ops->iotlb_range_add(domain, iova, pgsize); - pr_debug("unmapped: iova 0x%lx size 0x%zx\n", iova, unmapped_page); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 054391f30fa8..fad7fd8c167c 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -696,7 +696,6 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain, if (!unmapped) { kfree(entry); } else { - iommu_tlb_range_add(domain->domain, *iova, unmapped); entry->iova = *iova; entry->phys = phys; entry->len = unmapped; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index fdc355ccc570..1e21431262d9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -201,7 +201,6 @@ struct iommu_sva_ops { * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain - * @iotlb_range_add: Add a given iova range to the flush queue for this domain * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue @@ -244,8 +243,6 @@ struct iommu_ops { size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size); void (*flush_iotlb_all)(struct iommu_domain *domain); - void (*iotlb_range_add)(struct iommu_domain *domain, - unsigned long iova, size_t size); void (*iotlb_sync_map)(struct iommu_domain *domain); void (*iotlb_sync)(struct iommu_domain *domain); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); @@ -476,13 +473,6 @@ static inline void iommu_flush_tlb_all(struct iommu_domain *domain) domain->ops->flush_iotlb_all(domain); } -static inline void iommu_tlb_range_add(struct iommu_domain *domain, - unsigned long iova, size_t size) -{ - if (domain->ops->iotlb_range_add) - domain->ops->iotlb_range_add(domain, iova, size); -} - static inline void iommu_tlb_sync(struct iommu_domain *domain) { if (domain->ops->iotlb_sync) @@ -637,11 +627,6 @@ static inline void iommu_flush_tlb_all(struct iommu_domain *domain) { } -static inline void iommu_tlb_range_add(struct iommu_domain *domain, - unsigned long iova, size_t size) -{ -} - static inline void iommu_tlb_sync(struct iommu_domain *domain) { } -- cgit v1.2.3 From 298f78895b081911e0b3605f07d79ebd3d4cf7b0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:43:34 +0100 Subject: iommu/io-pgtable: Rename iommu_gather_ops to iommu_flush_ops In preparation for TLB flush gathering in the IOMMU API, rename the iommu_gather_ops structure in io-pgtable to iommu_flush_ops, which better describes its purpose and avoids the potential for confusion between different levels of the API. $ find linux/ -type f -name '*.[ch]' | xargs sed -i 's/gather_ops/flush_ops/g' Signed-off-by: Will Deacon --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 2 +- drivers/iommu/arm-smmu-v3.c | 4 ++-- drivers/iommu/arm-smmu.c | 8 ++++---- drivers/iommu/io-pgtable-arm-v7s.c | 2 +- drivers/iommu/io-pgtable-arm.c | 2 +- drivers/iommu/ipmmu-vmsa.c | 4 ++-- drivers/iommu/msm_iommu.c | 4 ++-- drivers/iommu/mtk_iommu.c | 4 ++-- drivers/iommu/qcom_iommu.c | 4 ++-- include/linux/io-pgtable.h | 6 +++--- 10 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 92ac995dd9c6..17bceb11e708 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -257,7 +257,7 @@ static void mmu_tlb_sync_context(void *cookie) // TODO: Wait 1000 GPU cycles for HW_ISSUE_6367/T60X } -static const struct iommu_gather_ops mmu_tlb_ops = { +static const struct iommu_flush_ops mmu_tlb_ops = { .tlb_flush_all = mmu_tlb_inv_context_s1, .tlb_add_flush = mmu_tlb_inv_range_nosync, .tlb_sync = mmu_tlb_sync_context, diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index a9a9fabd3968..7e137e1e28f1 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1603,7 +1603,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, } while (size -= granule); } -static const struct iommu_gather_ops arm_smmu_gather_ops = { +static const struct iommu_flush_ops arm_smmu_flush_ops = { .tlb_flush_all = arm_smmu_tlb_inv_context, .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, .tlb_sync = arm_smmu_tlb_sync, @@ -1796,7 +1796,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) .ias = ias, .oas = oas, .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, - .tlb = &arm_smmu_gather_ops, + .tlb = &arm_smmu_flush_ops, .iommu_dev = smmu->dev, }; diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 64977c131ee6..dc08db347ef3 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -251,7 +251,7 @@ enum arm_smmu_domain_stage { struct arm_smmu_domain { struct arm_smmu_device *smmu; struct io_pgtable_ops *pgtbl_ops; - const struct iommu_gather_ops *tlb_ops; + const struct iommu_flush_ops *tlb_ops; struct arm_smmu_cfg cfg; enum arm_smmu_domain_stage stage; bool non_strict; @@ -547,19 +547,19 @@ static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size, writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID); } -static const struct iommu_gather_ops arm_smmu_s1_tlb_ops = { +static const struct iommu_flush_ops arm_smmu_s1_tlb_ops = { .tlb_flush_all = arm_smmu_tlb_inv_context_s1, .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, .tlb_sync = arm_smmu_tlb_sync_context, }; -static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v2 = { +static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v2 = { .tlb_flush_all = arm_smmu_tlb_inv_context_s2, .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, .tlb_sync = arm_smmu_tlb_sync_context, }; -static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v1 = { +static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v1 = { .tlb_flush_all = arm_smmu_tlb_inv_context_s2, .tlb_add_flush = arm_smmu_tlb_inv_vmid_nosync, .tlb_sync = arm_smmu_tlb_sync_vmid, diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index a62733c6a632..116f97ee991e 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -817,7 +817,7 @@ static void dummy_tlb_sync(void *cookie) WARN_ON(cookie != cfg_cookie); } -static const struct iommu_gather_ops dummy_tlb_ops = { +static const struct iommu_flush_ops dummy_tlb_ops = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_add_flush = dummy_tlb_add_flush, .tlb_sync = dummy_tlb_sync, diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 0d6633921c1e..402f913b6f6d 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -1081,7 +1081,7 @@ static void dummy_tlb_sync(void *cookie) WARN_ON(cookie != cfg_cookie); } -static const struct iommu_gather_ops dummy_tlb_ops __initconst = { +static const struct iommu_flush_ops dummy_tlb_ops __initconst = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_add_flush = dummy_tlb_add_flush, .tlb_sync = dummy_tlb_sync, diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index ad0098c0c87c..2c14a2c65b22 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -367,7 +367,7 @@ static void ipmmu_tlb_add_flush(unsigned long iova, size_t size, /* The hardware doesn't support selective TLB flush. */ } -static const struct iommu_gather_ops ipmmu_gather_ops = { +static const struct iommu_flush_ops ipmmu_flush_ops = { .tlb_flush_all = ipmmu_tlb_flush_all, .tlb_add_flush = ipmmu_tlb_add_flush, .tlb_sync = ipmmu_tlb_flush_all, @@ -480,7 +480,7 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain) domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K; domain->cfg.ias = 32; domain->cfg.oas = 40; - domain->cfg.tlb = &ipmmu_gather_ops; + domain->cfg.tlb = &ipmmu_flush_ops; domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32); domain->io_domain.geometry.force_aperture = true; /* diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index b25e2eb9e038..8b602384a385 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -178,7 +178,7 @@ static void __flush_iotlb_sync(void *cookie) */ } -static const struct iommu_gather_ops msm_iommu_gather_ops = { +static const struct iommu_flush_ops msm_iommu_flush_ops = { .tlb_flush_all = __flush_iotlb, .tlb_add_flush = __flush_iotlb_range, .tlb_sync = __flush_iotlb_sync, @@ -345,7 +345,7 @@ static int msm_iommu_domain_config(struct msm_priv *priv) .pgsize_bitmap = msm_iommu_ops.pgsize_bitmap, .ias = 32, .oas = 32, - .tlb = &msm_iommu_gather_ops, + .tlb = &msm_iommu_flush_ops, .iommu_dev = priv->dev, }; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 82e4be4dfdaf..fed77658d67e 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -188,7 +188,7 @@ static void mtk_iommu_tlb_sync(void *cookie) } } -static const struct iommu_gather_ops mtk_iommu_gather_ops = { +static const struct iommu_flush_ops mtk_iommu_flush_ops = { .tlb_flush_all = mtk_iommu_tlb_flush_all, .tlb_add_flush = mtk_iommu_tlb_add_flush_nosync, .tlb_sync = mtk_iommu_tlb_sync, @@ -267,7 +267,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom) .pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap, .ias = 32, .oas = 32, - .tlb = &mtk_iommu_gather_ops, + .tlb = &mtk_iommu_flush_ops, .iommu_dev = data->dev, }; diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 34d0b9783b3e..fd9d9f4da735 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -164,7 +164,7 @@ static void qcom_iommu_tlb_inv_range_nosync(unsigned long iova, size_t size, } } -static const struct iommu_gather_ops qcom_gather_ops = { +static const struct iommu_flush_ops qcom_flush_ops = { .tlb_flush_all = qcom_iommu_tlb_inv_context, .tlb_add_flush = qcom_iommu_tlb_inv_range_nosync, .tlb_sync = qcom_iommu_tlb_sync, @@ -215,7 +215,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain, .pgsize_bitmap = qcom_iommu_ops.pgsize_bitmap, .ias = 32, .oas = 40, - .tlb = &qcom_gather_ops, + .tlb = &qcom_flush_ops, .iommu_dev = qcom_iommu->dev, }; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index b5a450a3bb47..6292ea15d674 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -17,7 +17,7 @@ enum io_pgtable_fmt { }; /** - * struct iommu_gather_ops - IOMMU callbacks for TLB and page table management. + * struct iommu_flush_ops - IOMMU callbacks for TLB and page table management. * * @tlb_flush_all: Synchronously invalidate the entire TLB context. * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range. @@ -28,7 +28,7 @@ enum io_pgtable_fmt { * Note that these can all be called in atomic context and must therefore * not block. */ -struct iommu_gather_ops { +struct iommu_flush_ops { void (*tlb_flush_all)(void *cookie); void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule, bool leaf, void *cookie); @@ -84,7 +84,7 @@ struct io_pgtable_cfg { unsigned int ias; unsigned int oas; bool coherent_walk; - const struct iommu_gather_ops *tlb; + const struct iommu_flush_ops *tlb; struct device *iommu_dev; /* Low-level data specific to the table format */ -- cgit v1.2.3 From a7d20dc19d9ea7012227be5144353012ffa3ddc4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:43:48 +0100 Subject: iommu: Introduce struct iommu_iotlb_gather for batching TLB flushes To permit batching of TLB flushes across multiple calls to the IOMMU driver's ->unmap() implementation, introduce a new structure for tracking the address range to be flushed and the granularity at which the flushing is required. This is hooked into the IOMMU API and its caller are updated to make use of the new structure. Subsequent patches will plumb this into the IOMMU drivers as well, but for now the gathering information is ignored. Signed-off-by: Will Deacon --- drivers/iommu/dma-iommu.c | 9 +++++++-- drivers/iommu/iommu.c | 19 +++++++++++------- drivers/vfio/vfio_iommu_type1.c | 26 ++++++++++++++++--------- include/linux/iommu.h | 43 +++++++++++++++++++++++++++++++++++++---- 4 files changed, 75 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index a7f9c3edbcb2..80beb1f5994a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -444,13 +444,18 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iova_domain *iovad = &cookie->iovad; size_t iova_off = iova_offset(iovad, dma_addr); + struct iommu_iotlb_gather iotlb_gather; + size_t unmapped; dma_addr -= iova_off; size = iova_align(iovad, size + iova_off); + iommu_iotlb_gather_init(&iotlb_gather); + + unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather); + WARN_ON(unmapped != size); - WARN_ON(iommu_unmap_fast(domain, dma_addr, size) != size); if (!cookie->fq_domain) - iommu_tlb_sync(domain); + iommu_tlb_sync(domain, &iotlb_gather); iommu_dma_free_iova(cookie, dma_addr, size); } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6d7b25fe2474..d67222fdfe44 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1862,7 +1862,7 @@ EXPORT_SYMBOL_GPL(iommu_map); static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, - bool sync) + struct iommu_iotlb_gather *iotlb_gather) { const struct iommu_ops *ops = domain->ops; size_t unmapped_page, unmapped = 0; @@ -1910,9 +1910,6 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unmapped += unmapped_page; } - if (sync && ops->iotlb_sync) - ops->iotlb_sync(domain); - trace_unmap(orig_iova, size, unmapped); return unmapped; } @@ -1920,14 +1917,22 @@ static size_t __iommu_unmap(struct iommu_domain *domain, size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { - return __iommu_unmap(domain, iova, size, true); + struct iommu_iotlb_gather iotlb_gather; + size_t ret; + + iommu_iotlb_gather_init(&iotlb_gather); + ret = __iommu_unmap(domain, iova, size, &iotlb_gather); + iommu_tlb_sync(domain, &iotlb_gather); + + return ret; } EXPORT_SYMBOL_GPL(iommu_unmap); size_t iommu_unmap_fast(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *iotlb_gather) { - return __iommu_unmap(domain, iova, size, false); + return __iommu_unmap(domain, iova, size, iotlb_gather); } EXPORT_SYMBOL_GPL(iommu_unmap_fast); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index fad7fd8c167c..ad830abe1021 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -650,12 +650,13 @@ unpin_exit: } static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, - struct list_head *regions) + struct list_head *regions, + struct iommu_iotlb_gather *iotlb_gather) { long unlocked = 0; struct vfio_regions *entry, *next; - iommu_tlb_sync(domain->domain); + iommu_tlb_sync(domain->domain, iotlb_gather); list_for_each_entry_safe(entry, next, regions, list) { unlocked += vfio_unpin_pages_remote(dma, @@ -685,13 +686,15 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain, struct vfio_dma *dma, dma_addr_t *iova, size_t len, phys_addr_t phys, long *unlocked, struct list_head *unmapped_list, - int *unmapped_cnt) + int *unmapped_cnt, + struct iommu_iotlb_gather *iotlb_gather) { size_t unmapped = 0; struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry) { - unmapped = iommu_unmap_fast(domain->domain, *iova, len); + unmapped = iommu_unmap_fast(domain->domain, *iova, len, + iotlb_gather); if (!unmapped) { kfree(entry); @@ -711,8 +714,8 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain, * or in case of errors. */ if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) { - *unlocked += vfio_sync_unpin(dma, domain, - unmapped_list); + *unlocked += vfio_sync_unpin(dma, domain, unmapped_list, + iotlb_gather); *unmapped_cnt = 0; } @@ -743,6 +746,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, dma_addr_t iova = dma->iova, end = dma->iova + dma->size; struct vfio_domain *domain, *d; LIST_HEAD(unmapped_region_list); + struct iommu_iotlb_gather iotlb_gather; int unmapped_region_cnt = 0; long unlocked = 0; @@ -767,6 +771,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, cond_resched(); } + iommu_iotlb_gather_init(&iotlb_gather); while (iova < end) { size_t unmapped, len; phys_addr_t phys, next; @@ -795,7 +800,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, */ unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys, &unlocked, &unmapped_region_list, - &unmapped_region_cnt); + &unmapped_region_cnt, + &iotlb_gather); if (!unmapped) { unmapped = unmap_unpin_slow(domain, dma, &iova, len, phys, &unlocked); @@ -806,8 +812,10 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, dma->iommu_mapped = false; - if (unmapped_region_cnt) - unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list); + if (unmapped_region_cnt) { + unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list, + &iotlb_gather); + } if (do_accounting) { vfio_lock_acct(dma, -unlocked, true); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1e21431262d9..aaf073010a9a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -191,6 +191,23 @@ struct iommu_sva_ops { #ifdef CONFIG_IOMMU_API +/** + * struct iommu_iotlb_gather - Range information for a pending IOTLB flush + * + * @start: IOVA representing the start of the range to be flushed + * @end: IOVA representing the end of the range to be flushed (exclusive) + * @pgsize: The interval at which to perform the flush + * + * This structure is intended to be updated by multiple calls to the + * ->unmap() function in struct iommu_ops before eventually being passed + * into ->iotlb_sync(). + */ +struct iommu_iotlb_gather { + unsigned long start; + unsigned long end; + size_t pgsize; +}; + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability @@ -375,6 +392,13 @@ static inline struct iommu_device *dev_to_iommu_device(struct device *dev) return (struct iommu_device *)dev_get_drvdata(dev); } +static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) +{ + *gather = (struct iommu_iotlb_gather) { + .start = ULONG_MAX, + }; +} + #define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */ #define IOMMU_GROUP_NOTIFY_DEL_DEVICE 2 /* Pre Device removed */ #define IOMMU_GROUP_NOTIFY_BIND_DRIVER 3 /* Pre Driver bind */ @@ -399,7 +423,8 @@ extern int iommu_map(struct iommu_domain *domain, unsigned long iova, extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size); extern size_t iommu_unmap_fast(struct iommu_domain *domain, - unsigned long iova, size_t size); + unsigned long iova, size_t size, + struct iommu_iotlb_gather *iotlb_gather); extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg,unsigned int nents, int prot); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); @@ -473,10 +498,13 @@ static inline void iommu_flush_tlb_all(struct iommu_domain *domain) domain->ops->flush_iotlb_all(domain); } -static inline void iommu_tlb_sync(struct iommu_domain *domain) +static inline void iommu_tlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *iotlb_gather) { if (domain->ops->iotlb_sync) domain->ops->iotlb_sync(domain); + + iommu_iotlb_gather_init(iotlb_gather); } /* PCI device grouping function */ @@ -557,6 +585,7 @@ struct iommu_group {}; struct iommu_fwspec {}; struct iommu_device {}; struct iommu_fault_param {}; +struct iommu_iotlb_gather {}; static inline bool iommu_present(struct bus_type *bus) { @@ -611,7 +640,8 @@ static inline size_t iommu_unmap(struct iommu_domain *domain, } static inline size_t iommu_unmap_fast(struct iommu_domain *domain, - unsigned long iova, int gfp_order) + unsigned long iova, int gfp_order, + struct iommu_iotlb_gather *iotlb_gather) { return 0; } @@ -627,7 +657,8 @@ static inline void iommu_flush_tlb_all(struct iommu_domain *domain) { } -static inline void iommu_tlb_sync(struct iommu_domain *domain) +static inline void iommu_tlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *iotlb_gather) { } @@ -812,6 +843,10 @@ static inline struct iommu_device *dev_to_iommu_device(struct device *dev) return NULL; } +static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) +{ +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } -- cgit v1.2.3 From 4fcf8544fc677fc8af135f1d86b3ba69c4ad429d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:43:57 +0100 Subject: iommu: Introduce iommu_iotlb_gather_add_page() Introduce a helper function for drivers to use when updating an iommu_iotlb_gather structure in response to an ->unmap() call, rather than having to open-code the logic in every page-table implementation. Signed-off-by: Will Deacon --- include/linux/iommu.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index aaf073010a9a..ad41aee55bc6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -507,6 +507,31 @@ static inline void iommu_tlb_sync(struct iommu_domain *domain, iommu_iotlb_gather_init(iotlb_gather); } +static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ + unsigned long start = iova, end = start + size; + + /* + * If the new page is disjoint from the current range or is mapped at + * a different granularity, then sync the TLB so that the gather + * structure can be rewritten. + */ + if (gather->pgsize != size || + end < gather->start || start > gather->end) { + if (gather->pgsize) + iommu_tlb_sync(domain, gather); + gather->pgsize = size; + } + + if (gather->end < end) + gather->end = end; + + if (gather->start > start) + gather->start = start; +} + /* PCI device grouping function */ extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ @@ -847,6 +872,12 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) { } +static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } -- cgit v1.2.3 From 511885d7061eda3eb1faf3f57dcc936ff75863f1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 24 Jul 2019 08:23:23 -0700 Subject: lib/timerqueue: Rely on rbtree semantics for next timer Simplify the timerqueue code by using cached rbtrees and rely on the tree leftmost node semantics to get the timer with earliest expiration time. This is a drop in conversion, and therefore semantics remain untouched. The runtime overhead of cached rbtrees is be pretty much the same as the current head->next method, noting that when removing the leftmost node, a common operation for the timerqueue, the rb_next(leftmost) is O(1) as well, so the next timer will either be the right node or its parent. Therefore no extra pointer chasing. Finally, the size of the struct timerqueue_head remains the same. Passes several hours of rcutorture. Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190724152323.bojciei3muvfxalm@linux-r8p5 --- include/linux/timerqueue.h | 13 ++++++------- lib/timerqueue.c | 30 ++++++++++++------------------ 2 files changed, 18 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 78b8cc73f12f..aff122f1062a 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -12,8 +12,7 @@ struct timerqueue_node { }; struct timerqueue_head { - struct rb_root head; - struct timerqueue_node *next; + struct rb_root_cached rb_root; }; @@ -29,13 +28,14 @@ extern struct timerqueue_node *timerqueue_iterate_next( * * @head: head of timerqueue * - * Returns a pointer to the timer node that has the - * earliest expiration time. + * Returns a pointer to the timer node that has the earliest expiration time. */ static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { - return head->next; + struct rb_node *leftmost = rb_first_cached(&head->rb_root); + + return rb_entry(leftmost, struct timerqueue_node, node); } static inline void timerqueue_init(struct timerqueue_node *node) @@ -45,7 +45,6 @@ static inline void timerqueue_init(struct timerqueue_node *node) static inline void timerqueue_init_head(struct timerqueue_head *head) { - head->head = RB_ROOT; - head->next = NULL; + head->rb_root = RB_ROOT_CACHED; } #endif /* _LINUX_TIMERQUEUE_H */ diff --git a/lib/timerqueue.c b/lib/timerqueue.c index bc7e64df27df..c52710964593 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -26,9 +26,10 @@ */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { - struct rb_node **p = &head->head.rb_node; + struct rb_node **p = &head->rb_root.rb_root.rb_node; struct rb_node *parent = NULL; - struct timerqueue_node *ptr; + struct timerqueue_node *ptr; + bool leftmost = true; /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); @@ -36,19 +37,17 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) while (*p) { parent = *p; ptr = rb_entry(parent, struct timerqueue_node, node); - if (node->expires < ptr->expires) + if (node->expires < ptr->expires) { p = &(*p)->rb_left; - else + } else { p = &(*p)->rb_right; + leftmost = false; + } } rb_link_node(&node->node, parent, p); - rb_insert_color(&node->node, &head->head); + rb_insert_color_cached(&node->node, &head->rb_root, leftmost); - if (!head->next || node->expires < head->next->expires) { - head->next = node; - return true; - } - return false; + return leftmost; } EXPORT_SYMBOL_GPL(timerqueue_add); @@ -65,15 +64,10 @@ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); - /* update next pointer */ - if (head->next == node) { - struct rb_node *rbn = rb_next(&node->node); - - head->next = rb_entry_safe(rbn, struct timerqueue_node, node); - } - rb_erase(&node->node, &head->head); + rb_erase_cached(&node->node, &head->rb_root); RB_CLEAR_NODE(&node->node); - return head->next != NULL; + + return !RB_EMPTY_ROOT(&head->rb_root.rb_root); } EXPORT_SYMBOL_GPL(timerqueue_del); -- cgit v1.2.3 From d7852fbd0f0423937fa287a598bfde188bb68c22 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Jul 2019 09:54:40 -0700 Subject: access: avoid the RCU grace period for the temporary subjective credentials It turns out that 'access()' (and 'faccessat()') can cause a lot of RCU work because it installs a temporary credential that gets allocated and freed for each system call. The allocation and freeing overhead is mostly benign, but because credentials can be accessed under the RCU read lock, the freeing involves a RCU grace period. Which is not a huge deal normally, but if you have a lot of access() calls, this causes a fair amount of seconday damage: instead of having a nice alloc/free patterns that hits in hot per-CPU slab caches, you have all those delayed free's, and on big machines with hundreds of cores, the RCU overhead can end up being enormous. But it turns out that all of this is entirely unnecessary. Exactly because access() only installs the credential as the thread-local subjective credential, the temporary cred pointer doesn't actually need to be RCU free'd at all. Once we're done using it, we can just free it synchronously and avoid all the RCU overhead. So add a 'non_rcu' flag to 'struct cred', which can be set by users that know they only use it in non-RCU context (there are other potential users for this). We can make it a union with the rcu freeing list head that we need for the RCU case, so this doesn't need any extra storage. Note that this also makes 'get_current_cred()' clear the new non_rcu flag, in case we have filesystems that take a long-term reference to the cred and then expect the RCU delayed freeing afterwards. It's not entirely clear that this is required, but it makes for clear semantics: the subjective cred remains non-RCU as long as you only access it synchronously using the thread-local accessors, but you _can_ use it as a generic cred if you want to. It is possible that we should just remove the whole RCU markings for ->cred entirely. Only ->real_cred is really supposed to be accessed through RCU, and the long-term cred copies that nfs uses might want to explicitly re-enable RCU freeing if required, rather than have get_current_cred() do it implicitly. But this is a "minimal semantic changes" change for the immediate problem. Acked-by: Peter Zijlstra (Intel) Acked-by: Eric Dumazet Acked-by: Paul E. McKenney Cc: Oleg Nesterov Cc: Jan Glauber Cc: Jiri Kosina Cc: Jayachandran Chandrasekharan Nair Cc: Greg KH Cc: Kees Cook Cc: David Howells Cc: Miklos Szeredi Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/open.c | 19 +++++++++++++++++++ include/linux/cred.h | 8 +++++++- kernel/cred.c | 21 +++++++++++++++++++-- 3 files changed, 45 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/open.c b/fs/open.c index b5b80469b93d..a59abe3c669a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -374,6 +374,25 @@ long do_faccessat(int dfd, const char __user *filename, int mode) override_cred->cap_permitted; } + /* + * The new set of credentials can *only* be used in + * task-synchronous circumstances, and does not need + * RCU freeing, unless somebody then takes a separate + * reference to it. + * + * NOTE! This is _only_ true because this credential + * is used purely for override_creds() that installs + * it as the subjective cred. Other threads will be + * accessing ->real_cred, not the subjective cred. + * + * If somebody _does_ make a copy of this (using the + * 'get_current_cred()' function), that will clear the + * non_rcu field, because now that other user may be + * expecting RCU freeing. But normal thread-synchronous + * cred accesses will keep things non-RCY. + */ + override_cred->non_rcu = 1; + old_cred = override_creds(override_cred); retry: res = user_path_at(dfd, filename, lookup_flags, &path); diff --git a/include/linux/cred.h b/include/linux/cred.h index 7eb43a038330..f7a30e0099be 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -145,7 +145,11 @@ struct cred { struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ - struct rcu_head rcu; /* RCU deletion hook */ + /* RCU deletion */ + union { + int non_rcu; /* Can we skip RCU deletion? */ + struct rcu_head rcu; /* RCU deletion hook */ + }; } __randomize_layout; extern void __put_cred(struct cred *); @@ -246,6 +250,7 @@ static inline const struct cred *get_cred(const struct cred *cred) if (!cred) return cred; validate_creds(cred); + nonconst_cred->non_rcu = 0; return get_new_cred(nonconst_cred); } @@ -257,6 +262,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) if (!atomic_inc_not_zero(&nonconst_cred->usage)) return NULL; validate_creds(cred); + nonconst_cred->non_rcu = 0; return cred; } diff --git a/kernel/cred.c b/kernel/cred.c index c73a87a4df13..153ae369e024 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -144,7 +144,10 @@ void __put_cred(struct cred *cred) BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); - call_rcu(&cred->rcu, put_cred_rcu); + if (cred->non_rcu) + put_cred_rcu(&cred->rcu); + else + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); @@ -256,6 +259,7 @@ struct cred *prepare_creds(void) old = task->cred; memcpy(new, old, sizeof(struct cred)); + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_group_info(new->group_info); @@ -535,7 +539,19 @@ const struct cred *override_creds(const struct cred *new) validate_creds(old); validate_creds(new); - get_cred(new); + + /* + * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. + * + * That means that we do not clear the 'non_rcu' flag, since + * we are only installing the cred into the thread-synchronous + * '->cred' pointer, not the '->real_cred' pointer that is + * visible to other threads under RCU. + * + * Also note that we did validate_creds() manually, not depending + * on the validation in 'get_cred()'. + */ + get_new_cred((struct cred *)new); alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); alter_cred_subscribers(old, -1); @@ -672,6 +688,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) validate_creds(old); *new = *old; + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); -- cgit v1.2.3 From 71d8e94dabee7fceac473d87445a03e848469a71 Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Wed, 26 Jun 2019 17:33:09 -0700 Subject: fpga: altera-pr-ip: Make alt_pr_unregister function void Make alt_pr_unregister function void, since it always returns 0, and nothing would act on the value anyways. Signed-off-by: Moritz Fischer --- drivers/fpga/altera-pr-ip-core-plat.c | 4 +++- drivers/fpga/altera-pr-ip-core.c | 4 +--- include/linux/fpga/altera-pr-ip-core.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/fpga/altera-pr-ip-core-plat.c b/drivers/fpga/altera-pr-ip-core-plat.c index b293d83143f1..99b9cc0e70f0 100644 --- a/drivers/fpga/altera-pr-ip-core-plat.c +++ b/drivers/fpga/altera-pr-ip-core-plat.c @@ -32,7 +32,9 @@ static int alt_pr_platform_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; - return alt_pr_unregister(dev); + alt_pr_unregister(dev); + + return 0; } static const struct of_device_id alt_pr_of_match[] = { diff --git a/drivers/fpga/altera-pr-ip-core.c b/drivers/fpga/altera-pr-ip-core.c index a7a3bf0b5202..2cf25fd5e897 100644 --- a/drivers/fpga/altera-pr-ip-core.c +++ b/drivers/fpga/altera-pr-ip-core.c @@ -201,15 +201,13 @@ int alt_pr_register(struct device *dev, void __iomem *reg_base) } EXPORT_SYMBOL_GPL(alt_pr_register); -int alt_pr_unregister(struct device *dev) +void alt_pr_unregister(struct device *dev) { struct fpga_manager *mgr = dev_get_drvdata(dev); dev_dbg(dev, "%s\n", __func__); fpga_mgr_unregister(mgr); - - return 0; } EXPORT_SYMBOL_GPL(alt_pr_unregister); diff --git a/include/linux/fpga/altera-pr-ip-core.h b/include/linux/fpga/altera-pr-ip-core.h index 7d4664730d60..0b08ac20ab16 100644 --- a/include/linux/fpga/altera-pr-ip-core.h +++ b/include/linux/fpga/altera-pr-ip-core.h @@ -13,6 +13,6 @@ #include int alt_pr_register(struct device *dev, void __iomem *reg_base); -int alt_pr_unregister(struct device *dev); +void alt_pr_unregister(struct device *dev); #endif /* _ALT_PR_IP_CORE_H */ -- cgit v1.2.3 From 3b51c44bd6936e86a7180abd9aebc4387a479253 Mon Sep 17 00:00:00 2001 From: Atif Niyaz Date: Wed, 24 Jul 2019 22:26:31 +0300 Subject: Input: allow drivers specify timestamp for input events Currently, evdev stamps events with timestamps acquired in evdev_events() However, this timestamping may not be accurate in terms of measuring when the actual event happened. Let's allow individual drivers specify timestamp in order to provide a more accurate sense of time for the event. It is expected that drivers will set the timestamp in their hard interrupt routine. Signed-off-by: Atif Niyaz Signed-off-by: Dmitry Torokhov --- drivers/input/evdev.c | 35 ++++++++--------------------------- drivers/input/input.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/input.h | 14 ++++++++++++++ 3 files changed, 62 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 867c2cfd0038..d7dd6fcf2db0 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -25,13 +25,6 @@ #include #include "input-compat.h" -enum evdev_clock_type { - EV_CLK_REAL = 0, - EV_CLK_MONO, - EV_CLK_BOOT, - EV_CLK_MAX -}; - struct evdev { int open; struct input_handle handle; @@ -53,7 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; - unsigned int clk_type; + enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; unsigned int bufsize; @@ -149,17 +142,10 @@ static void __evdev_flush_queue(struct evdev_client *client, unsigned int type) static void __evdev_queue_syn_dropped(struct evdev_client *client) { + ktime_t *ev_time = input_get_timestamp(client->evdev->handle.dev); + struct timespec64 ts = ktime_to_timespec64(ev_time[client->clk_type]); struct input_event ev; - ktime_t time; - struct timespec64 ts; - time = client->clk_type == EV_CLK_REAL ? - ktime_get_real() : - client->clk_type == EV_CLK_MONO ? - ktime_get() : - ktime_get_boottime(); - - ts = ktime_to_timespec64(time); ev.input_event_sec = ts.tv_sec; ev.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; ev.type = EV_SYN; @@ -188,18 +174,18 @@ static void evdev_queue_syn_dropped(struct evdev_client *client) static int evdev_set_clk_type(struct evdev_client *client, unsigned int clkid) { unsigned long flags; - unsigned int clk_type; + enum input_clock_type clk_type; switch (clkid) { case CLOCK_REALTIME: - clk_type = EV_CLK_REAL; + clk_type = INPUT_CLK_REAL; break; case CLOCK_MONOTONIC: - clk_type = EV_CLK_MONO; + clk_type = INPUT_CLK_MONO; break; case CLOCK_BOOTTIME: - clk_type = EV_CLK_BOOT; + clk_type = INPUT_CLK_BOOT; break; default: return -EINVAL; @@ -307,12 +293,7 @@ static void evdev_events(struct input_handle *handle, { struct evdev *evdev = handle->private; struct evdev_client *client; - ktime_t ev_time[EV_CLK_MAX]; - - ev_time[EV_CLK_MONO] = ktime_get(); - ev_time[EV_CLK_REAL] = ktime_mono_to_real(ev_time[EV_CLK_MONO]); - ev_time[EV_CLK_BOOT] = ktime_mono_to_any(ev_time[EV_CLK_MONO], - TK_OFFS_BOOT); + ktime_t *ev_time = input_get_timestamp(handle->dev); rcu_read_lock(); diff --git a/drivers/input/input.c b/drivers/input/input.c index 7f3c5fcb9ed6..7494a0dede79 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -1894,6 +1894,46 @@ void input_free_device(struct input_dev *dev) } EXPORT_SYMBOL(input_free_device); +/** + * input_set_timestamp - set timestamp for input events + * @dev: input device to set timestamp for + * @timestamp: the time at which the event has occurred + * in CLOCK_MONOTONIC + * + * This function is intended to provide to the input system a more + * accurate time of when an event actually occurred. The driver should + * call this function as soon as a timestamp is acquired ensuring + * clock conversions in input_set_timestamp are done correctly. + * + * The system entering suspend state between timestamp acquisition and + * calling input_set_timestamp can result in inaccurate conversions. + */ +void input_set_timestamp(struct input_dev *dev, ktime_t timestamp) +{ + dev->timestamp[INPUT_CLK_MONO] = timestamp; + dev->timestamp[INPUT_CLK_REAL] = ktime_mono_to_real(timestamp); + dev->timestamp[INPUT_CLK_BOOT] = ktime_mono_to_any(timestamp, + TK_OFFS_BOOT); +} +EXPORT_SYMBOL(input_set_timestamp); + +/** + * input_get_timestamp - get timestamp for input events + * @dev: input device to get timestamp from + * + * A valid timestamp is a timestamp of non-zero value. + */ +ktime_t *input_get_timestamp(struct input_dev *dev) +{ + const ktime_t invalid_timestamp = ktime_set(0, 0); + + if (!ktime_compare(dev->timestamp[INPUT_CLK_MONO], invalid_timestamp)) + input_set_timestamp(dev, ktime_get()); + + return dev->timestamp; +} +EXPORT_SYMBOL(input_get_timestamp); + /** * input_set_capability - mark device as capable of a certain event * @dev: device that is capable of emitting or accepting event diff --git a/include/linux/input.h b/include/linux/input.h index 510e78558c10..e95a439d8bd5 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -33,6 +33,13 @@ struct input_value { __s32 value; }; +enum input_clock_type { + INPUT_CLK_REAL = 0, + INPUT_CLK_MONO, + INPUT_CLK_BOOT, + INPUT_CLK_MAX +}; + /** * struct input_dev - represents an input device * @name: name of the device @@ -114,6 +121,8 @@ struct input_value { * @vals: array of values queued in the current frame * @devres_managed: indicates that devices is managed with devres framework * and needs not be explicitly unregistered or freed. + * @timestamp: storage for a timestamp set by input_set_timestamp called + * by a driver */ struct input_dev { const char *name; @@ -184,6 +193,8 @@ struct input_dev { struct input_value *vals; bool devres_managed; + + ktime_t timestamp[INPUT_CLK_MAX]; }; #define to_input_dev(d) container_of(d, struct input_dev, dev) @@ -382,6 +393,9 @@ void input_close_device(struct input_handle *); int input_flush_device(struct input_handle *handle, struct file *file); +void input_set_timestamp(struct input_dev *dev, ktime_t timestamp); +ktime_t *input_get_timestamp(struct input_dev *dev); + void input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value); void input_inject_event(struct input_handle *handle, unsigned int type, unsigned int code, int value); -- cgit v1.2.3 From d9c5252295218df4cfe64353aa860d7b5c8700ef Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Jul 2019 16:58:31 +0900 Subject: treewide: add "WITH Linux-syscall-note" to SPDX tag of uapi headers UAPI headers licensed under GPL are supposed to have exception "WITH Linux-syscall-note" so that they can be included into non-GPL user space application code. The exception note is missing in some UAPI headers. Some of them slipped in by the treewide conversion commit b24413180f56 ("License cleanup: add SPDX GPL-2.0 license identifier to files with no license"). Just run: $ git show --oneline b24413180f56 -- arch/x86/include/uapi/asm/ I believe they are not intentional, and should be fixed too. This patch was generated by the following script: git grep -l --not -e Linux-syscall-note --and -e SPDX-License-Identifier \ -- :arch/*/include/uapi/asm/*.h :include/uapi/ :^*/Kbuild | while read file do sed -i -e '/[[:space:]]OR[[:space:]]/s/\(GPL-[^[:space:]]*\)/(\1 WITH Linux-syscall-note)/g' \ -e '/[[:space:]]or[[:space:]]/s/\(GPL-[^[:space:]]*\)/(\1 WITH Linux-syscall-note)/g' \ -e '/[[:space:]]OR[[:space:]]/!{/[[:space:]]or[[:space:]]/!s/\(GPL-[^[:space:]]*\)/\1 WITH Linux-syscall-note/g}' $file done After this patch is applied, there are 5 UAPI headers that do not contain "WITH Linux-syscall-note". They are kept untouched since this exception applies only to GPL variants. $ git grep --not -e Linux-syscall-note --and -e SPDX-License-Identifier \ -- :arch/*/include/uapi/asm/*.h :include/uapi/ :^*/Kbuild include/uapi/drm/panfrost_drm.h:/* SPDX-License-Identifier: MIT */ include/uapi/linux/batman_adv.h:/* SPDX-License-Identifier: MIT */ include/uapi/linux/qemu_fw_cfg.h:/* SPDX-License-Identifier: BSD-3-Clause */ include/uapi/linux/vbox_err.h:/* SPDX-License-Identifier: MIT */ include/uapi/linux/virtio_iommu.h:/* SPDX-License-Identifier: BSD-3-Clause */ Signed-off-by: Masahiro Yamada Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/uapi/asm/bpf_perf_event.h | 2 +- arch/csky/include/uapi/asm/byteorder.h | 2 +- arch/csky/include/uapi/asm/cachectl.h | 2 +- arch/csky/include/uapi/asm/perf_regs.h | 2 +- arch/csky/include/uapi/asm/ptrace.h | 2 +- arch/csky/include/uapi/asm/sigcontext.h | 2 +- arch/csky/include/uapi/asm/unistd.h | 2 +- arch/nds32/include/uapi/asm/auxvec.h | 2 +- arch/nds32/include/uapi/asm/byteorder.h | 2 +- arch/nds32/include/uapi/asm/cachectl.h | 2 +- arch/nds32/include/uapi/asm/fp_udfiex_crtl.h | 2 +- arch/nds32/include/uapi/asm/param.h | 2 +- arch/nds32/include/uapi/asm/ptrace.h | 2 +- arch/nds32/include/uapi/asm/sigcontext.h | 2 +- arch/nds32/include/uapi/asm/unistd.h | 2 +- arch/powerpc/include/uapi/asm/bpf_perf_event.h | 2 +- arch/riscv/include/uapi/asm/auxvec.h | 2 +- arch/riscv/include/uapi/asm/bitsperlong.h | 2 +- arch/riscv/include/uapi/asm/byteorder.h | 2 +- arch/riscv/include/uapi/asm/hwcap.h | 2 +- arch/riscv/include/uapi/asm/ptrace.h | 2 +- arch/riscv/include/uapi/asm/sigcontext.h | 2 +- arch/riscv/include/uapi/asm/ucontext.h | 2 +- arch/s390/include/uapi/asm/bpf_perf_event.h | 2 +- arch/s390/include/uapi/asm/ipl.h | 2 +- arch/sh/include/uapi/asm/setup.h | 2 +- arch/sh/include/uapi/asm/types.h | 2 +- arch/sparc/include/uapi/asm/oradax.h | 2 +- arch/x86/include/uapi/asm/byteorder.h | 2 +- arch/x86/include/uapi/asm/hwcap2.h | 2 +- arch/x86/include/uapi/asm/sigcontext32.h | 2 +- arch/x86/include/uapi/asm/types.h | 2 +- include/uapi/linux/bpfilter.h | 2 +- include/uapi/linux/ipmi_bmc.h | 2 +- include/uapi/linux/isst_if.h | 2 +- include/uapi/linux/netfilter/nf_synproxy.h | 2 +- include/uapi/linux/psp-sev.h | 2 +- include/uapi/linux/rxrpc.h | 2 +- include/uapi/linux/usb/g_uvc.h | 2 +- include/uapi/linux/vbox_vmmdev_types.h | 2 +- include/uapi/linux/vboxguest.h | 2 +- include/uapi/linux/virtio_pmem.h | 2 +- include/uapi/linux/vmcore.h | 2 +- include/uapi/linux/wmi.h | 2 +- include/uapi/misc/fastrpc.h | 2 +- include/uapi/rdma/rvt-abi.h | 2 +- include/uapi/rdma/siw-abi.h | 2 +- include/uapi/scsi/scsi_bsg_ufs.h | 2 +- include/uapi/sound/skl-tplg-interface.h | 2 +- 49 files changed, 49 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/uapi/asm/bpf_perf_event.h b/arch/arm64/include/uapi/asm/bpf_perf_event.h index b551b741653d..5e1e648aeec4 100644 --- a/arch/arm64/include/uapi/asm/bpf_perf_event.h +++ b/arch/arm64/include/uapi/asm/bpf_perf_event.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ #define _UAPI__ASM_BPF_PERF_EVENT_H__ diff --git a/arch/csky/include/uapi/asm/byteorder.h b/arch/csky/include/uapi/asm/byteorder.h index b079ec715cdf..d150cd664873 100644 --- a/arch/csky/include/uapi/asm/byteorder.h +++ b/arch/csky/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #ifndef __ASM_CSKY_BYTEORDER_H diff --git a/arch/csky/include/uapi/asm/cachectl.h b/arch/csky/include/uapi/asm/cachectl.h index ddf2f39aa925..ed7fad1ea20d 100644 --- a/arch/csky/include/uapi/asm/cachectl.h +++ b/arch/csky/include/uapi/asm/cachectl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef __ASM_CSKY_CACHECTL_H #define __ASM_CSKY_CACHECTL_H diff --git a/arch/csky/include/uapi/asm/perf_regs.h b/arch/csky/include/uapi/asm/perf_regs.h index ee323d818592..49d4e147a559 100644 --- a/arch/csky/include/uapi/asm/perf_regs.h +++ b/arch/csky/include/uapi/asm/perf_regs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. #ifndef _ASM_CSKY_PERF_REGS_H diff --git a/arch/csky/include/uapi/asm/ptrace.h b/arch/csky/include/uapi/asm/ptrace.h index 4e248d5b86ef..66b2268e324e 100644 --- a/arch/csky/include/uapi/asm/ptrace.h +++ b/arch/csky/include/uapi/asm/ptrace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #ifndef _CSKY_PTRACE_H diff --git a/arch/csky/include/uapi/asm/sigcontext.h b/arch/csky/include/uapi/asm/sigcontext.h index e81e7ff11e36..670c020f2cb8 100644 --- a/arch/csky/include/uapi/asm/sigcontext.h +++ b/arch/csky/include/uapi/asm/sigcontext.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #ifndef __ASM_CSKY_SIGCONTEXT_H diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h index ec60e49cea66..211c983c7282 100644 --- a/arch/csky/include/uapi/asm/unistd.h +++ b/arch/csky/include/uapi/asm/unistd.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #define __ARCH_WANT_SYS_CLONE diff --git a/arch/nds32/include/uapi/asm/auxvec.h b/arch/nds32/include/uapi/asm/auxvec.h index b5d58ea8decb..bc0b92ab8c15 100644 --- a/arch/nds32/include/uapi/asm/auxvec.h +++ b/arch/nds32/include/uapi/asm/auxvec.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_AUXVEC_H diff --git a/arch/nds32/include/uapi/asm/byteorder.h b/arch/nds32/include/uapi/asm/byteorder.h index 511e653c709d..c264ef12c49c 100644 --- a/arch/nds32/include/uapi/asm/byteorder.h +++ b/arch/nds32/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_BYTEORDER_H__ diff --git a/arch/nds32/include/uapi/asm/cachectl.h b/arch/nds32/include/uapi/asm/cachectl.h index 73793662815c..31b9b439d819 100644 --- a/arch/nds32/include/uapi/asm/cachectl.h +++ b/arch/nds32/include/uapi/asm/cachectl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 1994, 1995, 1996 by Ralf Baechle // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASM_CACHECTL diff --git a/arch/nds32/include/uapi/asm/fp_udfiex_crtl.h b/arch/nds32/include/uapi/asm/fp_udfiex_crtl.h index d54a5d6c6538..f17396db16ec 100644 --- a/arch/nds32/include/uapi/asm/fp_udfiex_crtl.h +++ b/arch/nds32/include/uapi/asm/fp_udfiex_crtl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* Copyright (C) 2005-2019 Andes Technology Corporation */ #ifndef _FP_UDF_IEX_CRTL_H #define _FP_UDF_IEX_CRTL_H diff --git a/arch/nds32/include/uapi/asm/param.h b/arch/nds32/include/uapi/asm/param.h index 2977534a6bd3..48d00328d328 100644 --- a/arch/nds32/include/uapi/asm/param.h +++ b/arch/nds32/include/uapi/asm/param.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_PARAM_H diff --git a/arch/nds32/include/uapi/asm/ptrace.h b/arch/nds32/include/uapi/asm/ptrace.h index 1a6e01c00e6f..d76217c7c010 100644 --- a/arch/nds32/include/uapi/asm/ptrace.h +++ b/arch/nds32/include/uapi/asm/ptrace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __UAPI_ASM_NDS32_PTRACE_H diff --git a/arch/nds32/include/uapi/asm/sigcontext.h b/arch/nds32/include/uapi/asm/sigcontext.h index dc89af7ddcc3..6c1e6648878f 100644 --- a/arch/nds32/include/uapi/asm/sigcontext.h +++ b/arch/nds32/include/uapi/asm/sigcontext.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASMNDS32_SIGCONTEXT_H diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h index a0b2f7b9c0f2..410795e280fe 100644 --- a/arch/nds32/include/uapi/asm/unistd.h +++ b/arch/nds32/include/uapi/asm/unistd.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #define __ARCH_WANT_STAT64 diff --git a/arch/powerpc/include/uapi/asm/bpf_perf_event.h b/arch/powerpc/include/uapi/asm/bpf_perf_event.h index b551b741653d..5e1e648aeec4 100644 --- a/arch/powerpc/include/uapi/asm/bpf_perf_event.h +++ b/arch/powerpc/include/uapi/asm/bpf_perf_event.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ #define _UAPI__ASM_BPF_PERF_EVENT_H__ diff --git a/arch/riscv/include/uapi/asm/auxvec.h b/arch/riscv/include/uapi/asm/auxvec.h index 62716653554b..d86cb17bbabe 100644 --- a/arch/riscv/include/uapi/asm/auxvec.h +++ b/arch/riscv/include/uapi/asm/auxvec.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2015 Regents of the University of California diff --git a/arch/riscv/include/uapi/asm/bitsperlong.h b/arch/riscv/include/uapi/asm/bitsperlong.h index 0b9b58b57ff6..7d0b32e3b701 100644 --- a/arch/riscv/include/uapi/asm/bitsperlong.h +++ b/arch/riscv/include/uapi/asm/bitsperlong.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2015 Regents of the University of California diff --git a/arch/riscv/include/uapi/asm/byteorder.h b/arch/riscv/include/uapi/asm/byteorder.h index 1920debc09c0..f671e16bf6af 100644 --- a/arch/riscv/include/uapi/asm/byteorder.h +++ b/arch/riscv/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2015 Regents of the University of California diff --git a/arch/riscv/include/uapi/asm/hwcap.h b/arch/riscv/include/uapi/asm/hwcap.h index 7d786145183b..4e7646077056 100644 --- a/arch/riscv/include/uapi/asm/hwcap.h +++ b/arch/riscv/include/uapi/asm/hwcap.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copied from arch/arm64/include/asm/hwcap.h * diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h index 92d8f7cd8f84..882547f6bd5c 100644 --- a/arch/riscv/include/uapi/asm/ptrace.h +++ b/arch/riscv/include/uapi/asm/ptrace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 Regents of the University of California */ diff --git a/arch/riscv/include/uapi/asm/sigcontext.h b/arch/riscv/include/uapi/asm/sigcontext.h index 053f809e52ce..84f2dfcfdbce 100644 --- a/arch/riscv/include/uapi/asm/sigcontext.h +++ b/arch/riscv/include/uapi/asm/sigcontext.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 Regents of the University of California */ diff --git a/arch/riscv/include/uapi/asm/ucontext.h b/arch/riscv/include/uapi/asm/ucontext.h index b58e00cee2ec..411dd7b52ed6 100644 --- a/arch/riscv/include/uapi/asm/ucontext.h +++ b/arch/riscv/include/uapi/asm/ucontext.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2017 SiFive, Inc. diff --git a/arch/s390/include/uapi/asm/bpf_perf_event.h b/arch/s390/include/uapi/asm/bpf_perf_event.h index cefe7c7cd4f6..3ed42ff6da94 100644 --- a/arch/s390/include/uapi/asm/bpf_perf_event.h +++ b/arch/s390/include/uapi/asm/bpf_perf_event.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ #define _UAPI__ASM_BPF_PERF_EVENT_H__ diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index fd32b1cd80d2..451ba7d08905 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_S390_UAPI_IPL_H #define _ASM_S390_UAPI_IPL_H diff --git a/arch/sh/include/uapi/asm/setup.h b/arch/sh/include/uapi/asm/setup.h index 1170dd2fb998..4bd19f80f9b0 100644 --- a/arch/sh/include/uapi/asm/setup.h +++ b/arch/sh/include/uapi/asm/setup.h @@ -1,2 +1,2 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #include diff --git a/arch/sh/include/uapi/asm/types.h b/arch/sh/include/uapi/asm/types.h index f83795fdc0da..68100e108ea6 100644 --- a/arch/sh/include/uapi/asm/types.h +++ b/arch/sh/include/uapi/asm/types.h @@ -1,2 +1,2 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #include diff --git a/arch/sparc/include/uapi/asm/oradax.h b/arch/sparc/include/uapi/asm/oradax.h index 64c67f2ea33f..0dace69058ab 100644 --- a/arch/sparc/include/uapi/asm/oradax.h +++ b/arch/sparc/include/uapi/asm/oradax.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */ /* * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. */ diff --git a/arch/x86/include/uapi/asm/byteorder.h b/arch/x86/include/uapi/asm/byteorder.h index 484e3cfd7ef2..149143cab9ff 100644 --- a/arch/x86/include/uapi/asm/byteorder.h +++ b/arch/x86/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_BYTEORDER_H #define _ASM_X86_BYTEORDER_H diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 6ebaae90e207..8b2effe6efb8 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_HWCAP2_H #define _ASM_X86_HWCAP2_H diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 6b18e88de8a6..7114801d0499 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_SIGCONTEXT32_H #define _ASM_X86_SIGCONTEXT32_H diff --git a/arch/x86/include/uapi/asm/types.h b/arch/x86/include/uapi/asm/types.h index df55e1ddb0c9..9d5c11a24279 100644 --- a/arch/x86/include/uapi/asm/types.h +++ b/arch/x86/include/uapi/asm/types.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h index 2ec3cc99ea4c..cbc1f5813f50 100644 --- a/include/uapi/linux/bpfilter.h +++ b/include/uapi/linux/bpfilter.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_LINUX_BPFILTER_H #define _UAPI_LINUX_BPFILTER_H diff --git a/include/uapi/linux/ipmi_bmc.h b/include/uapi/linux/ipmi_bmc.h index 1670f0944227..782a03eb1086 100644 --- a/include/uapi/linux/ipmi_bmc.h +++ b/include/uapi/linux/ipmi_bmc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * Copyright (c) 2015-2018, Intel Corporation. */ diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index d10b832c58c5..0a52b7b093d3 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * Intel Speed Select Interface: OS to hardware Interface * Copyright (c) 2019, Intel Corporation. diff --git a/include/uapi/linux/netfilter/nf_synproxy.h b/include/uapi/linux/netfilter/nf_synproxy.h index 6f3791c8946f..00d787f0260e 100644 --- a/include/uapi/linux/netfilter/nf_synproxy.h +++ b/include/uapi/linux/netfilter/nf_synproxy.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _NF_SYNPROXY_H #define _NF_SYNPROXY_H diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 8654b2442f6a..592a0c1b77c9 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Userspace interface for AMD Secure Encrypted Virtualization (SEV) * platform management commands. diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 782069dcf607..4accfa7e266d 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */ /* Types and definitions for AF_RXRPC. * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. diff --git a/include/uapi/linux/usb/g_uvc.h b/include/uapi/linux/usb/g_uvc.h index 3c9ee3020cbb..652f169a019e 100644 --- a/include/uapi/linux/usb/g_uvc.h +++ b/include/uapi/linux/usb/g_uvc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * g_uvc.h -- USB Video Class Gadget driver API * diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h index 26f39816af14..c27289fd619a 100644 --- a/include/uapi/linux/vbox_vmmdev_types.h +++ b/include/uapi/linux/vbox_vmmdev_types.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR CDDL-1.0) */ /* * Virtual Device for Guest <-> VMM/Host communication, type definitions * which are also used for the vboxguest ioctl interface / by vboxsf diff --git a/include/uapi/linux/vboxguest.h b/include/uapi/linux/vboxguest.h index 612f0c7d3558..9cec58a6a5ea 100644 --- a/include/uapi/linux/vboxguest.h +++ b/include/uapi/linux/vboxguest.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR CDDL-1.0) */ /* * VBoxGuest - VirtualBox Guest Additions Driver Interface. * diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h index 9a63ed6d062f..b022787ffb94 100644 --- a/include/uapi/linux/virtio_pmem.h +++ b/include/uapi/linux/virtio_pmem.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */ /* * Definitions for virtio-pmem devices. * diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h index 022619668e0e..3e9da91866ff 100644 --- a/include/uapi/linux/vmcore.h +++ b/include/uapi/linux/vmcore.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_VMCORE_H #define _UAPI_VMCORE_H diff --git a/include/uapi/linux/wmi.h b/include/uapi/linux/wmi.h index c36f2d7675a4..7085c5dca9fa 100644 --- a/include/uapi/linux/wmi.h +++ b/include/uapi/linux/wmi.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * User API methods for ACPI-WMI mapping driver * diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index 6d701af9fc42..fb792e882cef 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef __QCOM_FASTRPC_H__ #define __QCOM_FASTRPC_H__ diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h index 7328293c715c..7c05a02d2be5 100644 --- a/include/uapi/rdma/rvt-abi.h +++ b/include/uapi/rdma/rvt-abi.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * This file contains defines, structures, etc. that are used diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h index 3dd8071ace7b..7de68f1dc707 100644 --- a/include/uapi/rdma/siw-abi.h +++ b/include/uapi/rdma/siw-abi.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) or BSD-3-Clause */ /* Authors: Bernard Metzler */ /* Copyright (c) 2008-2019, IBM Corporation */ diff --git a/include/uapi/scsi/scsi_bsg_ufs.h b/include/uapi/scsi/scsi_bsg_ufs.h index 17c7abd0803a..9988db6ad244 100644 --- a/include/uapi/scsi/scsi_bsg_ufs.h +++ b/include/uapi/scsi/scsi_bsg_ufs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * UFS Transport SGIO v4 BSG Message Support * diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h index f39352cef382..9eee32f5e407 100644 --- a/include/uapi/sound/skl-tplg-interface.h +++ b/include/uapi/sound/skl-tplg-interface.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * skl-tplg-interface.h - Intel DSP FW private data interface * -- cgit v1.2.3 From dc3bf49ea330414724e429e4e9b291899c134e3b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Jul 2019 16:58:32 +0900 Subject: treewide: remove SPDX "WITH Linux-syscall-note" from kernel-space headers again The "WITH Linux-syscall-note" exception exists for headers exported to user space. It is strange to add it to non-exported headers. Commit 687a3e4d8e61 ("treewide: remove SPDX "WITH Linux-syscall-note" from kernel-space headers") did cleanups some months ago, but it looks like we need to do this periodically. This patch was generated by the following script: git grep -l -e Linux-syscall-note \ -- :*.h :^arch/*/include/uapi/asm/*.h :^include/uapi/ :^tools | while read file do sed -i -e 's/(\(GPL-[^[:space:]]*\) WITH Linux-syscall-note)/\1/g' \ -e 's/ WITH Linux-syscall-note//g' $file done I did not commit drivers/staging/android/uapi/ion.h . This header is not currently exported, but somebody may plan to move it to include/uapi/ when the time comes. I am not sure. Anyway, it will be better to check the license inconsistency in drivers/staging/android/uapi/. Signed-off-by: Masahiro Yamada Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- include/sound/sof/control.h | 2 +- include/sound/sof/dai-intel.h | 2 +- include/sound/sof/dai.h | 2 +- include/sound/sof/header.h | 2 +- include/sound/sof/info.h | 2 +- include/sound/sof/pm.h | 2 +- include/sound/sof/stream.h | 2 +- include/sound/sof/topology.h | 2 +- include/sound/sof/trace.h | 2 +- include/sound/sof/xtensa.h | 2 +- samples/vfio-mdev/mdpy-defs.h | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/sound/sof/control.h b/include/sound/sof/control.h index bded69e696d4..6080ea0facd7 100644 --- a/include/sound/sof/control.h +++ b/include/sound/sof/control.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/dai-intel.h b/include/sound/sof/dai-intel.h index 4bb8ee138ba7..65e4c20e567c 100644 --- a/include/sound/sof/dai-intel.h +++ b/include/sound/sof/dai-intel.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/dai.h b/include/sound/sof/dai.h index 3d174e20aa53..5b8de1b1983c 100644 --- a/include/sound/sof/dai.h +++ b/include/sound/sof/dai.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/header.h b/include/sound/sof/header.h index 12867bbd4372..10f00c08dbb7 100644 --- a/include/sound/sof/header.h +++ b/include/sound/sof/header.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/info.h b/include/sound/sof/info.h index 16528d2b4a50..a9156b4a062c 100644 --- a/include/sound/sof/info.h +++ b/include/sound/sof/info.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/pm.h b/include/sound/sof/pm.h index 8ae3ad45bdf7..003879401d63 100644 --- a/include/sound/sof/pm.h +++ b/include/sound/sof/pm.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/stream.h b/include/sound/sof/stream.h index 643f175cb479..0b71b381b952 100644 --- a/include/sound/sof/stream.h +++ b/include/sound/sof/stream.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/topology.h b/include/sound/sof/topology.h index 41dcabf89899..c47b36240920 100644 --- a/include/sound/sof/topology.h +++ b/include/sound/sof/topology.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/trace.h b/include/sound/sof/trace.h index 9257d5473d97..fda6e8f6ead4 100644 --- a/include/sound/sof/trace.h +++ b/include/sound/sof/trace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/include/sound/sof/xtensa.h b/include/sound/sof/xtensa.h index d25c764b10e8..dd53d36b34e1 100644 --- a/include/sound/sof/xtensa.h +++ b/include/sound/sof/xtensa.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h index 96b3b1b49d34..eb26421b6429 100644 --- a/samples/vfio-mdev/mdpy-defs.h +++ b/samples/vfio-mdev/mdpy-defs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Simple pci display device. * -- cgit v1.2.3 From 110f87a6a5f65e825852632b9557f463e65251d7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 Jul 2019 23:49:08 +0900 Subject: usb: host: oxu210hp-hcd: remove include/linux/oxu210hp.h struct oxu210hp_platform_data is defined, but not used at all. $ git grep oxu210hp_platform_data include/linux/oxu210hp.h:struct oxu210hp_platform_data { include/linux/oxu210hp.h exists just for defining an unused structure, so it can go away. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20190721144909.5295-1-yamada.masahiro@socionext.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/oxu210hp.h | 2 -- include/linux/oxu210hp.h | 8 -------- 2 files changed, 10 deletions(-) delete mode 100644 include/linux/oxu210hp.h (limited to 'include') diff --git a/drivers/usb/host/oxu210hp.h b/drivers/usb/host/oxu210hp.h index 437044147862..67ebea4993b6 100644 --- a/drivers/usb/host/oxu210hp.h +++ b/drivers/usb/host/oxu210hp.h @@ -444,5 +444,3 @@ enum ehci_timer_action { TIMER_ASYNC_SHRINK, TIMER_ASYNC_OFF, }; - -#include diff --git a/include/linux/oxu210hp.h b/include/linux/oxu210hp.h deleted file mode 100644 index 94cd25165c08..000000000000 --- a/include/linux/oxu210hp.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* platform data for the OXU210HP HCD */ - -struct oxu210hp_platform_data { - unsigned int bus16:1; - unsigned int use_hcd_otg:1; - unsigned int use_hcd_sph:1; -}; -- cgit v1.2.3 From 4a2b8560e3dff8637ccb09524650864f60ebab7f Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 22 Jul 2019 08:51:46 +0200 Subject: tty: serial: netx: Delete driver The Netx ARM machine was deleted from the kernel. This driver had no users and has to go. Cc: Robert Schwebel Cc: Sascha Hauer Signed-off-by: Linus Walleij Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20190722065146.4844-1-linus.walleij@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 19 - drivers/tty/serial/Makefile | 1 - drivers/tty/serial/netx-serial.c | 733 --------------------------------------- include/uapi/linux/serial_core.h | 3 - 4 files changed, 756 deletions(-) delete mode 100644 drivers/tty/serial/netx-serial.c (limited to 'include') diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index fd385c8c53a5..3083dbae35f7 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1035,25 +1035,6 @@ config SERIAL_VT8500_CONSOLE depends on SERIAL_VT8500=y select SERIAL_CORE_CONSOLE -config SERIAL_NETX - tristate "NetX serial port support" - depends on ARCH_NETX - select SERIAL_CORE - help - If you have a machine based on a Hilscher NetX SoC you - can enable its onboard serial port by enabling this option. - - To compile this driver as a module, choose M here: the - module will be called netx-serial. - -config SERIAL_NETX_CONSOLE - bool "Console on NetX serial port" - depends on SERIAL_NETX=y - select SERIAL_CORE_CONSOLE - help - If you have enabled the serial port on the Hilscher NetX SoC - you can make it the console by answering Y to this option. - config SERIAL_OMAP tristate "OMAP serial port support" depends on ARCH_OMAP2PLUS diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 7cd7cabfa6c4..15a0fccadf7e 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -59,7 +59,6 @@ obj-$(CONFIG_SERIAL_ATMEL) += atmel_serial.o obj-$(CONFIG_SERIAL_UARTLITE) += uartlite.o obj-$(CONFIG_SERIAL_MSM) += msm_serial.o obj-$(CONFIG_SERIAL_QCOM_GENI) += qcom_geni_serial.o -obj-$(CONFIG_SERIAL_NETX) += netx-serial.o obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o obj-$(CONFIG_SERIAL_OMAP) += omap-serial.o obj-$(CONFIG_SERIAL_ALTERA_UART) += altera_uart.o diff --git a/drivers/tty/serial/netx-serial.c b/drivers/tty/serial/netx-serial.c deleted file mode 100644 index b3556863491f..000000000000 --- a/drivers/tty/serial/netx-serial.c +++ /dev/null @@ -1,733 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2005 Sascha Hauer , Pengutronix - */ - -#if defined(CONFIG_SERIAL_NETX_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) -#define SUPPORT_SYSRQ -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* We've been assigned a range on the "Low-density serial ports" major */ -#define SERIAL_NX_MAJOR 204 -#define MINOR_START 170 - -enum uart_regs { - UART_DR = 0x00, - UART_SR = 0x04, - UART_LINE_CR = 0x08, - UART_BAUDDIV_MSB = 0x0c, - UART_BAUDDIV_LSB = 0x10, - UART_CR = 0x14, - UART_FR = 0x18, - UART_IIR = 0x1c, - UART_ILPR = 0x20, - UART_RTS_CR = 0x24, - UART_RTS_LEAD = 0x28, - UART_RTS_TRAIL = 0x2c, - UART_DRV_ENABLE = 0x30, - UART_BRM_CR = 0x34, - UART_RXFIFO_IRQLEVEL = 0x38, - UART_TXFIFO_IRQLEVEL = 0x3c, -}; - -#define SR_FE (1<<0) -#define SR_PE (1<<1) -#define SR_BE (1<<2) -#define SR_OE (1<<3) - -#define LINE_CR_BRK (1<<0) -#define LINE_CR_PEN (1<<1) -#define LINE_CR_EPS (1<<2) -#define LINE_CR_STP2 (1<<3) -#define LINE_CR_FEN (1<<4) -#define LINE_CR_5BIT (0<<5) -#define LINE_CR_6BIT (1<<5) -#define LINE_CR_7BIT (2<<5) -#define LINE_CR_8BIT (3<<5) -#define LINE_CR_BITS_MASK (3<<5) - -#define CR_UART_EN (1<<0) -#define CR_SIREN (1<<1) -#define CR_SIRLP (1<<2) -#define CR_MSIE (1<<3) -#define CR_RIE (1<<4) -#define CR_TIE (1<<5) -#define CR_RTIE (1<<6) -#define CR_LBE (1<<7) - -#define FR_CTS (1<<0) -#define FR_DSR (1<<1) -#define FR_DCD (1<<2) -#define FR_BUSY (1<<3) -#define FR_RXFE (1<<4) -#define FR_TXFF (1<<5) -#define FR_RXFF (1<<6) -#define FR_TXFE (1<<7) - -#define IIR_MIS (1<<0) -#define IIR_RIS (1<<1) -#define IIR_TIS (1<<2) -#define IIR_RTIS (1<<3) -#define IIR_MASK 0xf - -#define RTS_CR_AUTO (1<<0) -#define RTS_CR_RTS (1<<1) -#define RTS_CR_COUNT (1<<2) -#define RTS_CR_MOD2 (1<<3) -#define RTS_CR_RTS_POL (1<<4) -#define RTS_CR_CTS_CTR (1<<5) -#define RTS_CR_CTS_POL (1<<6) -#define RTS_CR_STICK (1<<7) - -#define UART_PORT_SIZE 0x40 -#define DRIVER_NAME "netx-uart" - -struct netx_port { - struct uart_port port; -}; - -static void netx_stop_tx(struct uart_port *port) -{ - unsigned int val; - val = readl(port->membase + UART_CR); - writel(val & ~CR_TIE, port->membase + UART_CR); -} - -static void netx_stop_rx(struct uart_port *port) -{ - unsigned int val; - val = readl(port->membase + UART_CR); - writel(val & ~CR_RIE, port->membase + UART_CR); -} - -static void netx_enable_ms(struct uart_port *port) -{ - unsigned int val; - val = readl(port->membase + UART_CR); - writel(val | CR_MSIE, port->membase + UART_CR); -} - -static inline void netx_transmit_buffer(struct uart_port *port) -{ - struct circ_buf *xmit = &port->state->xmit; - - if (port->x_char) { - writel(port->x_char, port->membase + UART_DR); - port->icount.tx++; - port->x_char = 0; - return; - } - - if (uart_tx_stopped(port) || uart_circ_empty(xmit)) { - netx_stop_tx(port); - return; - } - - do { - /* send xmit->buf[xmit->tail] - * out the port here */ - writel(xmit->buf[xmit->tail], port->membase + UART_DR); - xmit->tail = (xmit->tail + 1) & - (UART_XMIT_SIZE - 1); - port->icount.tx++; - if (uart_circ_empty(xmit)) - break; - } while (!(readl(port->membase + UART_FR) & FR_TXFF)); - - if (uart_circ_empty(xmit)) - netx_stop_tx(port); -} - -static void netx_start_tx(struct uart_port *port) -{ - writel( - readl(port->membase + UART_CR) | CR_TIE, port->membase + UART_CR); - - if (!(readl(port->membase + UART_FR) & FR_TXFF)) - netx_transmit_buffer(port); -} - -static unsigned int netx_tx_empty(struct uart_port *port) -{ - return readl(port->membase + UART_FR) & FR_BUSY ? 0 : TIOCSER_TEMT; -} - -static void netx_txint(struct uart_port *port) -{ - struct circ_buf *xmit = &port->state->xmit; - - if (uart_circ_empty(xmit) || uart_tx_stopped(port)) { - netx_stop_tx(port); - return; - } - - netx_transmit_buffer(port); - - if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) - uart_write_wakeup(port); -} - -static void netx_rxint(struct uart_port *port, unsigned long *flags) -{ - unsigned char rx, flg, status; - - while (!(readl(port->membase + UART_FR) & FR_RXFE)) { - rx = readl(port->membase + UART_DR); - flg = TTY_NORMAL; - port->icount.rx++; - status = readl(port->membase + UART_SR); - if (status & SR_BE) { - writel(0, port->membase + UART_SR); - if (uart_handle_break(port)) - continue; - } - - if (unlikely(status & (SR_FE | SR_PE | SR_OE))) { - - if (status & SR_PE) - port->icount.parity++; - else if (status & SR_FE) - port->icount.frame++; - if (status & SR_OE) - port->icount.overrun++; - - status &= port->read_status_mask; - - if (status & SR_BE) - flg = TTY_BREAK; - else if (status & SR_PE) - flg = TTY_PARITY; - else if (status & SR_FE) - flg = TTY_FRAME; - } - - if (uart_handle_sysrq_char(port, rx)) - continue; - - uart_insert_char(port, status, SR_OE, rx, flg); - } - - spin_unlock_irqrestore(&port->lock, *flags); - tty_flip_buffer_push(&port->state->port); - spin_lock_irqsave(&port->lock, *flags); -} - -static irqreturn_t netx_int(int irq, void *dev_id) -{ - struct uart_port *port = dev_id; - unsigned long flags; - unsigned char status; - - spin_lock_irqsave(&port->lock,flags); - - status = readl(port->membase + UART_IIR) & IIR_MASK; - while (status) { - if (status & IIR_RIS) - netx_rxint(port, &flags); - if (status & IIR_TIS) - netx_txint(port); - if (status & IIR_MIS) { - if (readl(port->membase + UART_FR) & FR_CTS) - uart_handle_cts_change(port, 1); - else - uart_handle_cts_change(port, 0); - } - writel(0, port->membase + UART_IIR); - status = readl(port->membase + UART_IIR) & IIR_MASK; - } - - spin_unlock_irqrestore(&port->lock,flags); - return IRQ_HANDLED; -} - -static unsigned int netx_get_mctrl(struct uart_port *port) -{ - unsigned int ret = TIOCM_DSR | TIOCM_CAR; - - if (readl(port->membase + UART_FR) & FR_CTS) - ret |= TIOCM_CTS; - - return ret; -} - -static void netx_set_mctrl(struct uart_port *port, unsigned int mctrl) -{ - unsigned int val; - - /* FIXME: Locking needed ? */ - if (mctrl & TIOCM_RTS) { - val = readl(port->membase + UART_RTS_CR); - writel(val | RTS_CR_RTS, port->membase + UART_RTS_CR); - } -} - -static void netx_break_ctl(struct uart_port *port, int break_state) -{ - unsigned int line_cr; - spin_lock_irq(&port->lock); - - line_cr = readl(port->membase + UART_LINE_CR); - if (break_state != 0) - line_cr |= LINE_CR_BRK; - else - line_cr &= ~LINE_CR_BRK; - writel(line_cr, port->membase + UART_LINE_CR); - - spin_unlock_irq(&port->lock); -} - -static int netx_startup(struct uart_port *port) -{ - int ret; - - ret = request_irq(port->irq, netx_int, 0, - DRIVER_NAME, port); - if (ret) { - dev_err(port->dev, "unable to grab irq%d\n",port->irq); - goto exit; - } - - writel(readl(port->membase + UART_LINE_CR) | LINE_CR_FEN, - port->membase + UART_LINE_CR); - - writel(CR_MSIE | CR_RIE | CR_TIE | CR_RTIE | CR_UART_EN, - port->membase + UART_CR); - -exit: - return ret; -} - -static void netx_shutdown(struct uart_port *port) -{ - writel(0, port->membase + UART_CR) ; - - free_irq(port->irq, port); -} - -static void -netx_set_termios(struct uart_port *port, struct ktermios *termios, - struct ktermios *old) -{ - unsigned int baud, quot; - unsigned char old_cr; - unsigned char line_cr = LINE_CR_FEN; - unsigned char rts_cr = 0; - - switch (termios->c_cflag & CSIZE) { - case CS5: - line_cr |= LINE_CR_5BIT; - break; - case CS6: - line_cr |= LINE_CR_6BIT; - break; - case CS7: - line_cr |= LINE_CR_7BIT; - break; - case CS8: - line_cr |= LINE_CR_8BIT; - break; - } - - if (termios->c_cflag & CSTOPB) - line_cr |= LINE_CR_STP2; - - if (termios->c_cflag & PARENB) { - line_cr |= LINE_CR_PEN; - if (!(termios->c_cflag & PARODD)) - line_cr |= LINE_CR_EPS; - } - - if (termios->c_cflag & CRTSCTS) - rts_cr = RTS_CR_AUTO | RTS_CR_CTS_CTR | RTS_CR_RTS_POL; - - baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16); - quot = baud * 4096; - quot /= 1000; - quot *= 256; - quot /= 100000; - - spin_lock_irq(&port->lock); - - uart_update_timeout(port, termios->c_cflag, baud); - - old_cr = readl(port->membase + UART_CR); - - /* disable interrupts */ - writel(old_cr & ~(CR_MSIE | CR_RIE | CR_TIE | CR_RTIE), - port->membase + UART_CR); - - /* drain transmitter */ - while (readl(port->membase + UART_FR) & FR_BUSY); - - /* disable UART */ - writel(old_cr & ~CR_UART_EN, port->membase + UART_CR); - - /* modem status interrupts */ - old_cr &= ~CR_MSIE; - if (UART_ENABLE_MS(port, termios->c_cflag)) - old_cr |= CR_MSIE; - - writel((quot>>8) & 0xff, port->membase + UART_BAUDDIV_MSB); - writel(quot & 0xff, port->membase + UART_BAUDDIV_LSB); - writel(line_cr, port->membase + UART_LINE_CR); - - writel(rts_cr, port->membase + UART_RTS_CR); - - /* - * Characters to ignore - */ - port->ignore_status_mask = 0; - if (termios->c_iflag & IGNPAR) - port->ignore_status_mask |= SR_PE; - if (termios->c_iflag & IGNBRK) { - port->ignore_status_mask |= SR_BE; - /* - * If we're ignoring parity and break indicators, - * ignore overruns too (for real raw support). - */ - if (termios->c_iflag & IGNPAR) - port->ignore_status_mask |= SR_PE; - } - - port->read_status_mask = 0; - if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) - port->read_status_mask |= SR_BE; - if (termios->c_iflag & INPCK) - port->read_status_mask |= SR_PE | SR_FE; - - writel(old_cr, port->membase + UART_CR); - - spin_unlock_irq(&port->lock); -} - -static const char *netx_type(struct uart_port *port) -{ - return port->type == PORT_NETX ? "NETX" : NULL; -} - -static void netx_release_port(struct uart_port *port) -{ - release_mem_region(port->mapbase, UART_PORT_SIZE); -} - -static int netx_request_port(struct uart_port *port) -{ - return request_mem_region(port->mapbase, UART_PORT_SIZE, - DRIVER_NAME) != NULL ? 0 : -EBUSY; -} - -static void netx_config_port(struct uart_port *port, int flags) -{ - if (flags & UART_CONFIG_TYPE && netx_request_port(port) == 0) - port->type = PORT_NETX; -} - -static int -netx_verify_port(struct uart_port *port, struct serial_struct *ser) -{ - int ret = 0; - - if (ser->type != PORT_UNKNOWN && ser->type != PORT_NETX) - ret = -EINVAL; - - return ret; -} - -static struct uart_ops netx_pops = { - .tx_empty = netx_tx_empty, - .set_mctrl = netx_set_mctrl, - .get_mctrl = netx_get_mctrl, - .stop_tx = netx_stop_tx, - .start_tx = netx_start_tx, - .stop_rx = netx_stop_rx, - .enable_ms = netx_enable_ms, - .break_ctl = netx_break_ctl, - .startup = netx_startup, - .shutdown = netx_shutdown, - .set_termios = netx_set_termios, - .type = netx_type, - .release_port = netx_release_port, - .request_port = netx_request_port, - .config_port = netx_config_port, - .verify_port = netx_verify_port, -}; - -static struct netx_port netx_ports[] = { - { - .port = { - .type = PORT_NETX, - .iotype = UPIO_MEM, - .membase = (char __iomem *)io_p2v(NETX_PA_UART0), - .mapbase = NETX_PA_UART0, - .irq = NETX_IRQ_UART0, - .uartclk = 100000000, - .fifosize = 16, - .flags = UPF_BOOT_AUTOCONF, - .ops = &netx_pops, - .line = 0, - }, - }, { - .port = { - .type = PORT_NETX, - .iotype = UPIO_MEM, - .membase = (char __iomem *)io_p2v(NETX_PA_UART1), - .mapbase = NETX_PA_UART1, - .irq = NETX_IRQ_UART1, - .uartclk = 100000000, - .fifosize = 16, - .flags = UPF_BOOT_AUTOCONF, - .ops = &netx_pops, - .line = 1, - }, - }, { - .port = { - .type = PORT_NETX, - .iotype = UPIO_MEM, - .membase = (char __iomem *)io_p2v(NETX_PA_UART2), - .mapbase = NETX_PA_UART2, - .irq = NETX_IRQ_UART2, - .uartclk = 100000000, - .fifosize = 16, - .flags = UPF_BOOT_AUTOCONF, - .ops = &netx_pops, - .line = 2, - }, - } -}; - -#ifdef CONFIG_SERIAL_NETX_CONSOLE - -static void netx_console_putchar(struct uart_port *port, int ch) -{ - while (readl(port->membase + UART_FR) & FR_BUSY); - writel(ch, port->membase + UART_DR); -} - -static void -netx_console_write(struct console *co, const char *s, unsigned int count) -{ - struct uart_port *port = &netx_ports[co->index].port; - unsigned char cr_save; - - cr_save = readl(port->membase + UART_CR); - writel(cr_save | CR_UART_EN, port->membase + UART_CR); - - uart_console_write(port, s, count, netx_console_putchar); - - while (readl(port->membase + UART_FR) & FR_BUSY); - writel(cr_save, port->membase + UART_CR); -} - -static void __init -netx_console_get_options(struct uart_port *port, int *baud, - int *parity, int *bits, int *flow) -{ - unsigned char line_cr; - - *baud = (readl(port->membase + UART_BAUDDIV_MSB) << 8) | - readl(port->membase + UART_BAUDDIV_LSB); - *baud *= 1000; - *baud /= 4096; - *baud *= 1000; - *baud /= 256; - *baud *= 100; - - line_cr = readl(port->membase + UART_LINE_CR); - *parity = 'n'; - if (line_cr & LINE_CR_PEN) { - if (line_cr & LINE_CR_EPS) - *parity = 'e'; - else - *parity = 'o'; - } - - switch (line_cr & LINE_CR_BITS_MASK) { - case LINE_CR_8BIT: - *bits = 8; - break; - case LINE_CR_7BIT: - *bits = 7; - break; - case LINE_CR_6BIT: - *bits = 6; - break; - case LINE_CR_5BIT: - *bits = 5; - break; - } - - if (readl(port->membase + UART_RTS_CR) & RTS_CR_AUTO) - *flow = 'r'; -} - -static int __init -netx_console_setup(struct console *co, char *options) -{ - struct netx_port *sport; - int baud = 9600; - int bits = 8; - int parity = 'n'; - int flow = 'n'; - - /* - * Check whether an invalid uart number has been specified, and - * if so, search for the first available port that does have - * console support. - */ - if (co->index == -1 || co->index >= ARRAY_SIZE(netx_ports)) - co->index = 0; - sport = &netx_ports[co->index]; - - if (options) { - uart_parse_options(options, &baud, &parity, &bits, &flow); - } else { - /* if the UART is enabled, assume it has been correctly setup - * by the bootloader and get the options - */ - if (readl(sport->port.membase + UART_CR) & CR_UART_EN) { - netx_console_get_options(&sport->port, &baud, - &parity, &bits, &flow); - } - - } - - return uart_set_options(&sport->port, co, baud, parity, bits, flow); -} - -static struct uart_driver netx_reg; -static struct console netx_console = { - .name = "ttyNX", - .write = netx_console_write, - .device = uart_console_device, - .setup = netx_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, - .data = &netx_reg, -}; - -static int __init netx_console_init(void) -{ - register_console(&netx_console); - return 0; -} -console_initcall(netx_console_init); - -#define NETX_CONSOLE &netx_console -#else -#define NETX_CONSOLE NULL -#endif - -static struct uart_driver netx_reg = { - .owner = THIS_MODULE, - .driver_name = DRIVER_NAME, - .dev_name = "ttyNX", - .major = SERIAL_NX_MAJOR, - .minor = MINOR_START, - .nr = ARRAY_SIZE(netx_ports), - .cons = NETX_CONSOLE, -}; - -static int serial_netx_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct netx_port *sport = platform_get_drvdata(pdev); - - if (sport) - uart_suspend_port(&netx_reg, &sport->port); - - return 0; -} - -static int serial_netx_resume(struct platform_device *pdev) -{ - struct netx_port *sport = platform_get_drvdata(pdev); - - if (sport) - uart_resume_port(&netx_reg, &sport->port); - - return 0; -} - -static int serial_netx_probe(struct platform_device *pdev) -{ - struct uart_port *port = &netx_ports[pdev->id].port; - - dev_info(&pdev->dev, "initialising\n"); - - port->dev = &pdev->dev; - - writel(1, port->membase + UART_RXFIFO_IRQLEVEL); - uart_add_one_port(&netx_reg, &netx_ports[pdev->id].port); - platform_set_drvdata(pdev, &netx_ports[pdev->id]); - - return 0; -} - -static int serial_netx_remove(struct platform_device *pdev) -{ - struct netx_port *sport = platform_get_drvdata(pdev); - - if (sport) - uart_remove_one_port(&netx_reg, &sport->port); - - return 0; -} - -static struct platform_driver serial_netx_driver = { - .probe = serial_netx_probe, - .remove = serial_netx_remove, - - .suspend = serial_netx_suspend, - .resume = serial_netx_resume, - - .driver = { - .name = DRIVER_NAME, - }, -}; - -static int __init netx_serial_init(void) -{ - int ret; - - printk(KERN_INFO "Serial: NetX driver\n"); - - ret = uart_register_driver(&netx_reg); - if (ret) - return ret; - - ret = platform_driver_register(&serial_netx_driver); - if (ret != 0) - uart_unregister_driver(&netx_reg); - - return 0; -} - -static void __exit netx_serial_exit(void) -{ - platform_driver_unregister(&serial_netx_driver); - uart_unregister_driver(&netx_reg); -} - -module_init(netx_serial_init); -module_exit(netx_serial_exit); - -MODULE_AUTHOR("Sascha Hauer"); -MODULE_DESCRIPTION("NetX serial port driver"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:" DRIVER_NAME); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 5642c05e0da0..3cc3af1c2ee1 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -150,9 +150,6 @@ #define PORT_PNX8XXX 70 -/* Hilscher netx */ -#define PORT_NETX 71 - /* SUN4V Hypervisor Console */ #define PORT_SUNHV 72 -- cgit v1.2.3 From 173f6eacc8a89ae4b08b166735930e6c46951a81 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Sat, 20 Jul 2019 07:47:07 -0400 Subject: media: v4l: ctrls: Add debug messages Currently, the v4l2 control code is a bit silent on errors. Add debug messages on (hopefully) most of the error paths. Signed-off-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/kapi/v4l2-dev.rst | 1 + drivers/media/platform/omap3isp/ispvideo.c | 4 +- drivers/media/v4l2-core/v4l2-ctrls.c | 126 +++++++++++++++++++++++------ drivers/media/v4l2-core/v4l2-ioctl.c | 18 +++-- drivers/media/v4l2-core/v4l2-subdev.c | 6 +- include/media/v4l2-ctrls.h | 9 ++- include/media/v4l2-ioctl.h | 2 + 7 files changed, 127 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/Documentation/media/kapi/v4l2-dev.rst b/Documentation/media/kapi/v4l2-dev.rst index b359f1804bbe..4c5a15c53dbf 100644 --- a/Documentation/media/kapi/v4l2-dev.rst +++ b/Documentation/media/kapi/v4l2-dev.rst @@ -288,6 +288,7 @@ Mask Description 0x08 Log the read and write file operations and the VIDIOC_QBUF and VIDIOC_DQBUF ioctls. 0x10 Log the poll file operation. +0x20 Log error and messages in the control operations. ===== ================================================================ Video device cleanup diff --git a/drivers/media/platform/omap3isp/ispvideo.c b/drivers/media/platform/omap3isp/ispvideo.c index b52c6fe825ac..ee183c35ff3b 100644 --- a/drivers/media/platform/omap3isp/ispvideo.c +++ b/drivers/media/platform/omap3isp/ispvideo.c @@ -1020,8 +1020,8 @@ static int isp_video_check_external_subdevs(struct isp_video *video, ctrls.count = 1; ctrls.controls = &ctrl; - - ret = v4l2_g_ext_ctrls(pipe->external->ctrl_handler, NULL, &ctrls); + ret = v4l2_g_ext_ctrls(pipe->external->ctrl_handler, &video->video, + NULL, &ctrls); if (ret < 0) { dev_warn(isp->dev, "no pixel rate control in subdev %s\n", pipe->external->name); diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 13236c191796..76fa2db0e8fb 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -6,6 +6,8 @@ */ +#define pr_fmt(fmt) "v4l2-ctrls: " fmt + #include #include #include @@ -16,6 +18,12 @@ #include #include +#define dprintk(vdev, fmt, arg...) do { \ + if (!WARN_ON(!(vdev)) && ((vdev)->dev_debug & V4L2_DEV_DEBUG_CTRL)) \ + printk(KERN_DEBUG pr_fmt("%s: %s: " fmt), \ + __func__, video_device_node_name(vdev), ##arg); \ +} while (0) + #define has_op(master, op) \ (master->ops && master->ops->op) #define call_op(master, op) \ @@ -3260,6 +3268,7 @@ static int v4l2_ctrl_request_bind(struct media_request *req, static int prepare_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct v4l2_ext_controls *cs, struct v4l2_ctrl_helper *helpers, + struct video_device *vdev, bool get) { struct v4l2_ctrl_helper *h; @@ -3277,20 +3286,31 @@ static int prepare_ext_ctrls(struct v4l2_ctrl_handler *hdl, if (cs->which && cs->which != V4L2_CTRL_WHICH_DEF_VAL && cs->which != V4L2_CTRL_WHICH_REQUEST_VAL && - V4L2_CTRL_ID2WHICH(id) != cs->which) + V4L2_CTRL_ID2WHICH(id) != cs->which) { + dprintk(vdev, + "invalid which 0x%x or control id 0x%x\n", + cs->which, id); return -EINVAL; + } /* Old-style private controls are not allowed for extended controls */ - if (id >= V4L2_CID_PRIVATE_BASE) + if (id >= V4L2_CID_PRIVATE_BASE) { + dprintk(vdev, + "old-style private controls not allowed\n"); return -EINVAL; + } ref = find_ref_lock(hdl, id); - if (ref == NULL) + if (ref == NULL) { + dprintk(vdev, "cannot find control id 0x%x\n", id); return -EINVAL; + } h->ref = ref; ctrl = ref->ctrl; - if (ctrl->flags & V4L2_CTRL_FLAG_DISABLED) + if (ctrl->flags & V4L2_CTRL_FLAG_DISABLED) { + dprintk(vdev, "control id 0x%x is disabled\n", id); return -EINVAL; + } if (ctrl->cluster[0]->ncontrols > 1) have_clusters = true; @@ -3300,10 +3320,17 @@ static int prepare_ext_ctrls(struct v4l2_ctrl_handler *hdl, unsigned tot_size = ctrl->elems * ctrl->elem_size; if (c->size < tot_size) { + /* + * In the get case the application first + * queries to obtain the size of the control. + */ if (get) { c->size = tot_size; return -ENOSPC; } + dprintk(vdev, + "pointer control id 0x%x size too small, %d bytes but %d bytes needed\n", + id, c->size, tot_size); return -EFAULT; } c->size = tot_size; @@ -3364,7 +3391,8 @@ static int class_check(struct v4l2_ctrl_handler *hdl, u32 which) /* Get extended controls. Allocates the helpers array if needed. */ static int v4l2_g_ext_ctrls_common(struct v4l2_ctrl_handler *hdl, - struct v4l2_ext_controls *cs) + struct v4l2_ext_controls *cs, + struct video_device *vdev) { struct v4l2_ctrl_helper helper[4]; struct v4l2_ctrl_helper *helpers = helper; @@ -3390,7 +3418,7 @@ static int v4l2_g_ext_ctrls_common(struct v4l2_ctrl_handler *hdl, return -ENOMEM; } - ret = prepare_ext_ctrls(hdl, cs, helpers, true); + ret = prepare_ext_ctrls(hdl, cs, helpers, vdev, true); cs->error_idx = cs->count; for (i = 0; !ret && i < cs->count; i++) @@ -3483,8 +3511,8 @@ v4l2_ctrls_find_req_obj(struct v4l2_ctrl_handler *hdl, return obj; } -int v4l2_g_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct media_device *mdev, - struct v4l2_ext_controls *cs) +int v4l2_g_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct video_device *vdev, + struct media_device *mdev, struct v4l2_ext_controls *cs) { struct media_request_object *obj = NULL; struct media_request *req = NULL; @@ -3520,7 +3548,7 @@ int v4l2_g_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct media_device *mdev, req_obj); } - ret = v4l2_g_ext_ctrls_common(hdl, cs); + ret = v4l2_g_ext_ctrls_common(hdl, cs, vdev); if (obj) { media_request_unlock_for_access(req); @@ -3663,7 +3691,9 @@ static int try_or_set_cluster(struct v4l2_fh *fh, struct v4l2_ctrl *master, /* Validate controls. */ static int validate_ctrls(struct v4l2_ext_controls *cs, - struct v4l2_ctrl_helper *helpers, bool set) + struct v4l2_ctrl_helper *helpers, + struct video_device *vdev, + bool set) { unsigned i; int ret = 0; @@ -3675,16 +3705,24 @@ static int validate_ctrls(struct v4l2_ext_controls *cs, cs->error_idx = i; - if (ctrl->flags & V4L2_CTRL_FLAG_READ_ONLY) + if (ctrl->flags & V4L2_CTRL_FLAG_READ_ONLY) { + dprintk(vdev, + "control id 0x%x is read-only\n", + ctrl->id); return -EACCES; + } /* This test is also done in try_set_control_cluster() which is called in atomic context, so that has the final say, but it makes sense to do an up-front check as well. Once an error occurs in try_set_control_cluster() some other controls may have been set already and we want to do a best-effort to avoid that. */ - if (set && (ctrl->flags & V4L2_CTRL_FLAG_GRABBED)) + if (set && (ctrl->flags & V4L2_CTRL_FLAG_GRABBED)) { + dprintk(vdev, + "control id 0x%x is grabbed, cannot set\n", + ctrl->id); return -EBUSY; + } /* * Skip validation for now if the payload needs to be copied * from userspace into kernelspace. We'll validate those later. @@ -3719,7 +3757,8 @@ static void update_from_auto_cluster(struct v4l2_ctrl *master) /* Try or try-and-set controls */ static int try_set_ext_ctrls_common(struct v4l2_fh *fh, struct v4l2_ctrl_handler *hdl, - struct v4l2_ext_controls *cs, bool set) + struct v4l2_ext_controls *cs, + struct video_device *vdev, bool set) { struct v4l2_ctrl_helper helper[4]; struct v4l2_ctrl_helper *helpers = helper; @@ -3729,13 +3768,19 @@ static int try_set_ext_ctrls_common(struct v4l2_fh *fh, cs->error_idx = cs->count; /* Default value cannot be changed */ - if (cs->which == V4L2_CTRL_WHICH_DEF_VAL) + if (cs->which == V4L2_CTRL_WHICH_DEF_VAL) { + dprintk(vdev, "%s: cannot change default value\n", + video_device_node_name(vdev)); return -EINVAL; + } cs->which = V4L2_CTRL_ID2WHICH(cs->which); - if (hdl == NULL) + if (hdl == NULL) { + dprintk(vdev, "%s: invalid null control handler\n", + video_device_node_name(vdev)); return -EINVAL; + } if (cs->count == 0) return class_check(hdl, cs->which); @@ -3746,9 +3791,9 @@ static int try_set_ext_ctrls_common(struct v4l2_fh *fh, if (!helpers) return -ENOMEM; } - ret = prepare_ext_ctrls(hdl, cs, helpers, false); + ret = prepare_ext_ctrls(hdl, cs, helpers, vdev, false); if (!ret) - ret = validate_ctrls(cs, helpers, set); + ret = validate_ctrls(cs, helpers, vdev, set); if (ret && set) cs->error_idx = cs->count; for (i = 0; !ret && i < cs->count; i++) { @@ -3833,7 +3878,9 @@ static int try_set_ext_ctrls_common(struct v4l2_fh *fh, } static int try_set_ext_ctrls(struct v4l2_fh *fh, - struct v4l2_ctrl_handler *hdl, struct media_device *mdev, + struct v4l2_ctrl_handler *hdl, + struct video_device *vdev, + struct media_device *mdev, struct v4l2_ext_controls *cs, bool set) { struct media_request_object *obj = NULL; @@ -3841,21 +3888,39 @@ static int try_set_ext_ctrls(struct v4l2_fh *fh, int ret; if (cs->which == V4L2_CTRL_WHICH_REQUEST_VAL) { - if (!mdev || cs->request_fd < 0) + if (!mdev) { + dprintk(vdev, "%s: missing media device\n", + video_device_node_name(vdev)); + return -EINVAL; + } + + if (cs->request_fd < 0) { + dprintk(vdev, "%s: invalid request fd %d\n", + video_device_node_name(vdev), cs->request_fd); return -EINVAL; + } req = media_request_get_by_fd(mdev, cs->request_fd); - if (IS_ERR(req)) + if (IS_ERR(req)) { + dprintk(vdev, "%s: cannot find request fd %d\n", + video_device_node_name(vdev), cs->request_fd); return PTR_ERR(req); + } ret = media_request_lock_for_update(req); if (ret) { + dprintk(vdev, "%s: cannot lock request fd %d\n", + video_device_node_name(vdev), cs->request_fd); media_request_put(req); return ret; } obj = v4l2_ctrls_find_req_obj(hdl, req, set); if (IS_ERR(obj)) { + dprintk(vdev, + "%s: cannot find request object for request fd %d\n", + video_device_node_name(vdev), + cs->request_fd); media_request_unlock_for_update(req); media_request_put(req); return PTR_ERR(obj); @@ -3864,7 +3929,11 @@ static int try_set_ext_ctrls(struct v4l2_fh *fh, req_obj); } - ret = try_set_ext_ctrls_common(fh, hdl, cs, set); + ret = try_set_ext_ctrls_common(fh, hdl, cs, vdev, set); + if (ret) + dprintk(vdev, + "%s: try_set_ext_ctrls_common failed (%d)\n", + video_device_node_name(vdev), ret); if (obj) { media_request_unlock_for_update(req); @@ -3875,17 +3944,22 @@ static int try_set_ext_ctrls(struct v4l2_fh *fh, return ret; } -int v4l2_try_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct media_device *mdev, +int v4l2_try_ext_ctrls(struct v4l2_ctrl_handler *hdl, + struct video_device *vdev, + struct media_device *mdev, struct v4l2_ext_controls *cs) { - return try_set_ext_ctrls(NULL, hdl, mdev, cs, false); + return try_set_ext_ctrls(NULL, hdl, vdev, mdev, cs, false); } EXPORT_SYMBOL(v4l2_try_ext_ctrls); -int v4l2_s_ext_ctrls(struct v4l2_fh *fh, struct v4l2_ctrl_handler *hdl, - struct media_device *mdev, struct v4l2_ext_controls *cs) +int v4l2_s_ext_ctrls(struct v4l2_fh *fh, + struct v4l2_ctrl_handler *hdl, + struct video_device *vdev, + struct media_device *mdev, + struct v4l2_ext_controls *cs) { - return try_set_ext_ctrls(fh, hdl, mdev, cs, true); + return try_set_ext_ctrls(fh, hdl, vdev, mdev, cs, true); } EXPORT_SYMBOL(v4l2_s_ext_ctrls); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index e36629ae2203..38765af9ad2d 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -2186,9 +2186,11 @@ static int v4l_g_ext_ctrls(const struct v4l2_ioctl_ops *ops, p->error_idx = p->count; if (vfh && vfh->ctrl_handler) - return v4l2_g_ext_ctrls(vfh->ctrl_handler, vfd->v4l2_dev->mdev, p); + return v4l2_g_ext_ctrls(vfh->ctrl_handler, + vfd, vfd->v4l2_dev->mdev, p); if (vfd->ctrl_handler) - return v4l2_g_ext_ctrls(vfd->ctrl_handler, vfd->v4l2_dev->mdev, p); + return v4l2_g_ext_ctrls(vfd->ctrl_handler, + vfd, vfd->v4l2_dev->mdev, p); if (ops->vidioc_g_ext_ctrls == NULL) return -ENOTTY; return check_ext_ctrls(p, 0) ? ops->vidioc_g_ext_ctrls(file, fh, p) : @@ -2205,9 +2207,11 @@ static int v4l_s_ext_ctrls(const struct v4l2_ioctl_ops *ops, p->error_idx = p->count; if (vfh && vfh->ctrl_handler) - return v4l2_s_ext_ctrls(vfh, vfh->ctrl_handler, vfd->v4l2_dev->mdev, p); + return v4l2_s_ext_ctrls(vfh, vfh->ctrl_handler, + vfd, vfd->v4l2_dev->mdev, p); if (vfd->ctrl_handler) - return v4l2_s_ext_ctrls(NULL, vfd->ctrl_handler, vfd->v4l2_dev->mdev, p); + return v4l2_s_ext_ctrls(NULL, vfd->ctrl_handler, + vfd, vfd->v4l2_dev->mdev, p); if (ops->vidioc_s_ext_ctrls == NULL) return -ENOTTY; return check_ext_ctrls(p, 0) ? ops->vidioc_s_ext_ctrls(file, fh, p) : @@ -2224,9 +2228,11 @@ static int v4l_try_ext_ctrls(const struct v4l2_ioctl_ops *ops, p->error_idx = p->count; if (vfh && vfh->ctrl_handler) - return v4l2_try_ext_ctrls(vfh->ctrl_handler, vfd->v4l2_dev->mdev, p); + return v4l2_try_ext_ctrls(vfh->ctrl_handler, + vfd, vfd->v4l2_dev->mdev, p); if (vfd->ctrl_handler) - return v4l2_try_ext_ctrls(vfd->ctrl_handler, vfd->v4l2_dev->mdev, p); + return v4l2_try_ext_ctrls(vfd->ctrl_handler, + vfd, vfd->v4l2_dev->mdev, p); if (ops->vidioc_try_ext_ctrls == NULL) return -ENOTTY; return check_ext_ctrls(p, 0) ? ops->vidioc_try_ext_ctrls(file, fh, p) : diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c index 25c73c13cc7e..f725cd9b66b9 100644 --- a/drivers/media/v4l2-core/v4l2-subdev.c +++ b/drivers/media/v4l2-core/v4l2-subdev.c @@ -372,19 +372,19 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg) if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_g_ext_ctrls(vfh->ctrl_handler, - sd->v4l2_dev->mdev, arg); + vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_S_EXT_CTRLS: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_s_ext_ctrls(vfh, vfh->ctrl_handler, - sd->v4l2_dev->mdev, arg); + vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_TRY_EXT_CTRLS: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_try_ext_ctrls(vfh->ctrl_handler, - sd->v4l2_dev->mdev, arg); + vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_DQEVENT: if (!(sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS)) diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index 6e9dc9c44bb1..570ff4b0205a 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -1268,25 +1268,28 @@ int v4l2_s_ctrl(struct v4l2_fh *fh, struct v4l2_ctrl_handler *hdl, * :ref:`VIDIOC_G_EXT_CTRLS ` ioctl * * @hdl: pointer to &struct v4l2_ctrl_handler + * @vdev: pointer to &struct video_device * @mdev: pointer to &struct media_device * @c: pointer to &struct v4l2_ext_controls * * If hdl == NULL then they will all return -EINVAL. */ -int v4l2_g_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct media_device *mdev, - struct v4l2_ext_controls *c); +int v4l2_g_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct video_device *vdev, + struct media_device *mdev, struct v4l2_ext_controls *c); /** * v4l2_try_ext_ctrls - Helper function to implement * :ref:`VIDIOC_TRY_EXT_CTRLS ` ioctl * * @hdl: pointer to &struct v4l2_ctrl_handler + * @vdev: pointer to &struct video_device * @mdev: pointer to &struct media_device * @c: pointer to &struct v4l2_ext_controls * * If hdl == NULL then they will all return -EINVAL. */ int v4l2_try_ext_ctrls(struct v4l2_ctrl_handler *hdl, + struct video_device *vdev, struct media_device *mdev, struct v4l2_ext_controls *c); @@ -1296,12 +1299,14 @@ int v4l2_try_ext_ctrls(struct v4l2_ctrl_handler *hdl, * * @fh: pointer to &struct v4l2_fh * @hdl: pointer to &struct v4l2_ctrl_handler + * @vdev: pointer to &struct video_device * @mdev: pointer to &struct media_device * @c: pointer to &struct v4l2_ext_controls * * If hdl == NULL then they will all return -EINVAL. */ int v4l2_s_ext_ctrls(struct v4l2_fh *fh, struct v4l2_ctrl_handler *hdl, + struct video_device *vdev, struct media_device *mdev, struct v4l2_ext_controls *c); diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h index 400f2e46c108..4bba65a59d46 100644 --- a/include/media/v4l2-ioctl.h +++ b/include/media/v4l2-ioctl.h @@ -602,6 +602,8 @@ struct v4l2_ioctl_ops { #define V4L2_DEV_DEBUG_STREAMING 0x08 /* Log poll() */ #define V4L2_DEV_DEBUG_POLL 0x10 +/* Log controls */ +#define V4L2_DEV_DEBUG_CTRL 0x20 /* Video standard functions */ -- cgit v1.2.3 From ee484875af0080a0c8621e58facf06f92c658a4b Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 23 Jul 2019 08:58:01 -0400 Subject: media: davinci/vpfe_capture.c: drop unused format descriptions Simplify vpfe_pixel_format to just contain the pixelformat and bpp fields. All others are unused. Signed-off-by: Hans Verkuil Acked-by: Lad, Prabhakar Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/davinci/vpfe_capture.c | 51 +++++---------------------- include/media/davinci/vpfe_capture.h | 2 +- 2 files changed, 10 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/drivers/media/platform/davinci/vpfe_capture.c b/drivers/media/platform/davinci/vpfe_capture.c index 852fc357e19d..916ed743d716 100644 --- a/drivers/media/platform/davinci/vpfe_capture.c +++ b/drivers/media/platform/davinci/vpfe_capture.c @@ -119,57 +119,27 @@ static const struct vpfe_standard vpfe_standards[] = { /* Used when raw Bayer image from ccdc is directly captured to SDRAM */ static const struct vpfe_pixel_format vpfe_pix_fmts[] = { { - .fmtdesc = { - .index = 0, - .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, - .description = "Bayer GrRBGb 8bit A-Law compr.", - .pixelformat = V4L2_PIX_FMT_SBGGR8, - }, + .pixelformat = V4L2_PIX_FMT_SBGGR8, .bpp = 1, }, { - .fmtdesc = { - .index = 1, - .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, - .description = "Bayer GrRBGb - 16bit", - .pixelformat = V4L2_PIX_FMT_SBGGR16, - }, + .pixelformat = V4L2_PIX_FMT_SBGGR16, .bpp = 2, }, { - .fmtdesc = { - .index = 2, - .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, - .description = "Bayer GrRBGb 8bit DPCM compr.", - .pixelformat = V4L2_PIX_FMT_SGRBG10DPCM8, - }, + .pixelformat = V4L2_PIX_FMT_SGRBG10DPCM8, .bpp = 1, }, { - .fmtdesc = { - .index = 3, - .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, - .description = "YCbCr 4:2:2 Interleaved UYVY", - .pixelformat = V4L2_PIX_FMT_UYVY, - }, + .pixelformat = V4L2_PIX_FMT_UYVY, .bpp = 2, }, { - .fmtdesc = { - .index = 4, - .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, - .description = "YCbCr 4:2:2 Interleaved YUYV", - .pixelformat = V4L2_PIX_FMT_YUYV, - }, + .pixelformat = V4L2_PIX_FMT_YUYV, .bpp = 2, }, { - .fmtdesc = { - .index = 5, - .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, - .description = "Y/CbCr 4:2:0 - Semi planar", - .pixelformat = V4L2_PIX_FMT_NV12, - }, + .pixelformat = V4L2_PIX_FMT_NV12, .bpp = 1, }, }; @@ -183,7 +153,7 @@ static const struct vpfe_pixel_format *vpfe_lookup_pix_format(u32 pix_format) int i; for (i = 0; i < ARRAY_SIZE(vpfe_pix_fmts); i++) { - if (pix_format == vpfe_pix_fmts[i].fmtdesc.pixelformat) + if (pix_format == vpfe_pix_fmts[i].pixelformat) return &vpfe_pix_fmts[i]; } return NULL; @@ -782,7 +752,7 @@ static const struct vpfe_pixel_format * temp = 0; found = 0; while (ccdc_dev->hw_ops.enum_pix(&pix, temp) >= 0) { - if (vpfe_pix_fmt->fmtdesc.pixelformat == pix) { + if (vpfe_pix_fmt->pixelformat == pix) { found = 1; break; } @@ -899,7 +869,6 @@ static int vpfe_enum_fmt_vid_cap(struct file *file, void *priv, { struct vpfe_device *vpfe_dev = video_drvdata(file); const struct vpfe_pixel_format *pix_fmt; - int temp_index; u32 pix; v4l2_dbg(1, debug, &vpfe_dev->v4l2_dev, "vpfe_enum_fmt_vid_cap\n"); @@ -910,9 +879,7 @@ static int vpfe_enum_fmt_vid_cap(struct file *file, void *priv, /* Fill in the information about format */ pix_fmt = vpfe_lookup_pix_format(pix); if (pix_fmt) { - temp_index = fmt->index; - *fmt = pix_fmt->fmtdesc; - fmt->index = temp_index; + fmt->pixelformat = fmt->pixelformat; return 0; } return -EINVAL; diff --git a/include/media/davinci/vpfe_capture.h b/include/media/davinci/vpfe_capture.h index 2c5b3eacf527..4ad53031e2f7 100644 --- a/include/media/davinci/vpfe_capture.h +++ b/include/media/davinci/vpfe_capture.h @@ -32,7 +32,7 @@ #define CAPTURE_DRV_NAME "vpfe-capture" struct vpfe_pixel_format { - struct v4l2_fmtdesc fmtdesc; + u32 pixelformat; /* bytes per pixel */ int bpp; }; -- cgit v1.2.3 From 615c164da0eb42cbfb1688cb429cc4d5039db5d8 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 5 Jul 2019 17:14:21 +0300 Subject: intel_th: msu: Introduce buffer interface Introduces a concept of external buffers, which is a mechanism for creating trace sinks that would receive trace data from MSC buffers and transfer it elsewhere. A external buffer can implement its own window allocation/deallocation if it has to. It must provide a callback that's used to notify it when a window fills up, so that it can then start a DMA transaction from that window 'elsewhere'. This window remains in a 'locked' state and won't be used for storing new trace data until the buffer 'unlocks' it with a provided API call, at which point the window can be used again for storing trace data. This relies on a functional "last block" interrupt, so not all versions of Trace Hub can use this feature, which does not reflect on existing users. Signed-off-by: Alexander Shishkin Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20190705141425.19894-2-alexander.shishkin@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- .../ABI/testing/sysfs-bus-intel_th-devices-msc | 3 +- MAINTAINERS | 1 + drivers/hwtracing/intel_th/msu.c | 388 +++++++++++++++++++-- drivers/hwtracing/intel_th/msu.h | 20 +- include/linux/intel_th.h | 79 +++++ 5 files changed, 461 insertions(+), 30 deletions(-) create mode 100644 include/linux/intel_th.h (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-bus-intel_th-devices-msc b/Documentation/ABI/testing/sysfs-bus-intel_th-devices-msc index f54ae244f3f1..456cb62b384c 100644 --- a/Documentation/ABI/testing/sysfs-bus-intel_th-devices-msc +++ b/Documentation/ABI/testing/sysfs-bus-intel_th-devices-msc @@ -12,7 +12,8 @@ Description: (RW) Configure MSC operating mode: - "single", for contiguous buffer mode (high-order alloc); - "multi", for multiblock mode; - "ExI", for DCI handler mode; - - "debug", for debug mode. + - "debug", for debug mode; + - any of the currently loaded buffer sinks. If operating mode changes, existing buffer is deallocated, provided there are no active users and tracing is not enabled, otherwise the write will fail. diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..c8c506b8423b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8360,6 +8360,7 @@ M: Alexander Shishkin S: Supported F: Documentation/trace/intel_th.rst F: drivers/hwtracing/intel_th/ +F: include/linux/intel_th.h INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT) M: Ning Sun diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index 8ab28e5fb366..08413e6a075f 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -17,21 +17,48 @@ #include #include #include +#include #include #ifdef CONFIG_X86 #include #endif +#include #include "intel_th.h" #include "msu.h" #define msc_dev(x) (&(x)->thdev->dev) +/* + * Lockout state transitions: + * READY -> INUSE -+-> LOCKED -+-> READY -> etc. + * \-----------/ + * WIN_READY: window can be used by HW + * WIN_INUSE: window is in use + * WIN_LOCKED: window is filled up and is being processed by the buffer + * handling code + * + * All state transitions happen automatically, except for the LOCKED->READY, + * which needs to be signalled by the buffer code by calling + * intel_th_msc_window_unlock(). + * + * When the interrupt handler has to switch to the next window, it checks + * whether it's READY, and if it is, it performs the switch and tracing + * continues. If it's LOCKED, it stops the trace. + */ +enum lockout_state { + WIN_READY = 0, + WIN_INUSE, + WIN_LOCKED +}; + /** * struct msc_window - multiblock mode window descriptor * @entry: window list linkage (msc::win_list) * @pgoff: page offset into the buffer that this window starts at + * @lockout: lockout state, see comment below + * @lo_lock: lockout state serialization * @nr_blocks: number of blocks (pages) in this window * @nr_segs: number of segments in this window (<= @nr_blocks) * @_sgt: array of block descriptors @@ -40,6 +67,8 @@ struct msc_window { struct list_head entry; unsigned long pgoff; + enum lockout_state lockout; + spinlock_t lo_lock; unsigned int nr_blocks; unsigned int nr_segs; struct msc *msc; @@ -77,6 +106,8 @@ struct msc_iter { * struct msc - MSC device representation * @reg_base: register window base address * @thdev: intel_th_device pointer + * @mbuf: MSU buffer, if assigned + * @mbuf_priv MSU buffer's private data, if @mbuf * @win_list: list of windows in multiblock mode * @single_sgt: single mode buffer * @cur_win: current window @@ -100,6 +131,10 @@ struct msc { void __iomem *msu_base; struct intel_th_device *thdev; + const struct msu_buffer *mbuf; + void *mbuf_priv; + + struct work_struct work; struct list_head win_list; struct sg_table single_sgt; struct msc_window *cur_win; @@ -126,6 +161,101 @@ struct msc { unsigned int index; }; +static LIST_HEAD(msu_buffer_list); +static struct mutex msu_buffer_mutex; + +/** + * struct msu_buffer_entry - internal MSU buffer bookkeeping + * @entry: link to msu_buffer_list + * @mbuf: MSU buffer object + * @owner: module that provides this MSU buffer + */ +struct msu_buffer_entry { + struct list_head entry; + const struct msu_buffer *mbuf; + struct module *owner; +}; + +static struct msu_buffer_entry *__msu_buffer_entry_find(const char *name) +{ + struct msu_buffer_entry *mbe; + + lockdep_assert_held(&msu_buffer_mutex); + + list_for_each_entry(mbe, &msu_buffer_list, entry) { + if (!strcmp(mbe->mbuf->name, name)) + return mbe; + } + + return NULL; +} + +static const struct msu_buffer * +msu_buffer_get(const char *name) +{ + struct msu_buffer_entry *mbe; + + mutex_lock(&msu_buffer_mutex); + mbe = __msu_buffer_entry_find(name); + if (mbe && !try_module_get(mbe->owner)) + mbe = NULL; + mutex_unlock(&msu_buffer_mutex); + + return mbe ? mbe->mbuf : NULL; +} + +static void msu_buffer_put(const struct msu_buffer *mbuf) +{ + struct msu_buffer_entry *mbe; + + mutex_lock(&msu_buffer_mutex); + mbe = __msu_buffer_entry_find(mbuf->name); + if (mbe) + module_put(mbe->owner); + mutex_unlock(&msu_buffer_mutex); +} + +int intel_th_msu_buffer_register(const struct msu_buffer *mbuf, + struct module *owner) +{ + struct msu_buffer_entry *mbe; + int ret = 0; + + mbe = kzalloc(sizeof(*mbe), GFP_KERNEL); + if (!mbe) + return -ENOMEM; + + mutex_lock(&msu_buffer_mutex); + if (__msu_buffer_entry_find(mbuf->name)) { + ret = -EEXIST; + kfree(mbe); + goto unlock; + } + + mbe->mbuf = mbuf; + mbe->owner = owner; + list_add_tail(&mbe->entry, &msu_buffer_list); +unlock: + mutex_unlock(&msu_buffer_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(intel_th_msu_buffer_register); + +void intel_th_msu_buffer_unregister(const struct msu_buffer *mbuf) +{ + struct msu_buffer_entry *mbe; + + mutex_lock(&msu_buffer_mutex); + mbe = __msu_buffer_entry_find(mbuf->name); + if (mbe) { + list_del(&mbe->entry); + kfree(mbe); + } + mutex_unlock(&msu_buffer_mutex); +} +EXPORT_SYMBOL_GPL(intel_th_msu_buffer_unregister); + static inline bool msc_block_is_empty(struct msc_block_desc *bdesc) { /* header hasn't been written */ @@ -188,6 +318,25 @@ static struct msc_window *msc_next_window(struct msc_window *win) return list_next_entry(win, entry); } +static size_t msc_win_total_sz(struct msc_window *win) +{ + unsigned int blk; + size_t size = 0; + + for (blk = 0; blk < win->nr_segs; blk++) { + struct msc_block_desc *bdesc = msc_win_block(win, blk); + + if (msc_block_wrapped(bdesc)) + return win->nr_blocks << PAGE_SHIFT; + + size += msc_total_sz(bdesc); + if (msc_block_last_written(bdesc)) + break; + } + + return size; +} + /** * msc_find_window() - find a window matching a given sg_table * @msc: MSC device @@ -527,6 +676,9 @@ static int intel_th_msu_init(struct msc *msc) if (!msc->do_irq) return 0; + if (!msc->mbuf) + return 0; + mintctl = ioread32(msc->msu_base + REG_MSU_MINTCTL); mintctl |= msc->index ? M1BLIE : M0BLIE; iowrite32(mintctl, msc->msu_base + REG_MSU_MINTCTL); @@ -554,6 +706,44 @@ static void intel_th_msu_deinit(struct msc *msc) iowrite32(mintctl, msc->msu_base + REG_MSU_MINTCTL); } +static int msc_win_set_lockout(struct msc_window *win, + enum lockout_state expect, + enum lockout_state new) +{ + enum lockout_state old; + unsigned long flags; + int ret = 0; + + if (!win->msc->mbuf) + return 0; + + spin_lock_irqsave(&win->lo_lock, flags); + old = win->lockout; + + if (old != expect) { + ret = -EINVAL; + dev_warn_ratelimited(msc_dev(win->msc), + "expected lockout state %d, got %d\n", + expect, old); + goto unlock; + } + + win->lockout = new; + +unlock: + spin_unlock_irqrestore(&win->lo_lock, flags); + + if (ret) { + if (expect == WIN_READY && old == WIN_LOCKED) + return -EBUSY; + + /* from intel_th_msc_window_unlock(), don't warn if not locked */ + if (expect == WIN_LOCKED && old == new) + return 0; + } + + return ret; +} /** * msc_configure() - set up MSC hardware * @msc: the MSC device to configure @@ -571,8 +761,12 @@ static int msc_configure(struct msc *msc) if (msc->mode > MSC_MODE_MULTI) return -ENOTSUPP; - if (msc->mode == MSC_MODE_MULTI) + if (msc->mode == MSC_MODE_MULTI) { + if (msc_win_set_lockout(msc->cur_win, WIN_READY, WIN_INUSE)) + return -EBUSY; + msc_buffer_clear_hw_header(msc); + } reg = msc->base_addr >> PAGE_SHIFT; iowrite32(reg, msc->reg_base + REG_MSU_MSC0BAR); @@ -594,10 +788,14 @@ static int msc_configure(struct msc *msc) iowrite32(reg, msc->reg_base + REG_MSU_MSC0CTL); + intel_th_msu_init(msc); + msc->thdev->output.multiblock = msc->mode == MSC_MODE_MULTI; intel_th_trace_enable(msc->thdev); msc->enabled = 1; + if (msc->mbuf && msc->mbuf->activate) + msc->mbuf->activate(msc->mbuf_priv); return 0; } @@ -611,10 +809,17 @@ static int msc_configure(struct msc *msc) */ static void msc_disable(struct msc *msc) { + struct msc_window *win = msc->cur_win; u32 reg; lockdep_assert_held(&msc->buf_mutex); + if (msc->mode == MSC_MODE_MULTI) + msc_win_set_lockout(win, WIN_INUSE, WIN_LOCKED); + + if (msc->mbuf && msc->mbuf->deactivate) + msc->mbuf->deactivate(msc->mbuf_priv); + intel_th_msu_deinit(msc); intel_th_trace_disable(msc->thdev); if (msc->mode == MSC_MODE_SINGLE) { @@ -630,6 +835,11 @@ static void msc_disable(struct msc *msc) reg = ioread32(msc->reg_base + REG_MSU_MSC0CTL); reg &= ~MSC_EN; iowrite32(reg, msc->reg_base + REG_MSU_MSC0CTL); + + if (msc->mbuf && msc->mbuf->ready) + msc->mbuf->ready(msc->mbuf_priv, win->sgt, + msc_win_total_sz(win)); + msc->enabled = 0; iowrite32(0, msc->reg_base + REG_MSU_MSC0BAR); @@ -640,6 +850,10 @@ static void msc_disable(struct msc *msc) reg = ioread32(msc->reg_base + REG_MSU_MSC0STS); dev_dbg(msc_dev(msc), "MSCnSTS: %08x\n", reg); + + reg = ioread32(msc->reg_base + REG_MSU_MSUSTS); + reg &= msc->index ? MSUSTS_MSC1BLAST : MSUSTS_MSC0BLAST; + iowrite32(reg, msc->reg_base + REG_MSU_MSUSTS); } static int intel_th_msc_activate(struct intel_th_device *thdev) @@ -856,6 +1070,8 @@ static int msc_buffer_win_alloc(struct msc *msc, unsigned int nr_blocks) win->msc = msc; win->sgt = &win->_sgt; + win->lockout = WIN_READY; + spin_lock_init(&win->lo_lock); if (!list_empty(&msc->win_list)) { struct msc_window *prev = list_last_entry(&msc->win_list, @@ -865,8 +1081,13 @@ static int msc_buffer_win_alloc(struct msc *msc, unsigned int nr_blocks) win->pgoff = prev->pgoff + prev->nr_blocks; } - ret = __msc_buffer_win_alloc(win, nr_blocks); - if (ret < 0) + if (msc->mbuf && msc->mbuf->alloc_window) + ret = msc->mbuf->alloc_window(msc->mbuf_priv, &win->sgt, + nr_blocks << PAGE_SHIFT); + else + ret = __msc_buffer_win_alloc(win, nr_blocks); + + if (ret <= 0) goto err_nomem; msc_buffer_set_uc(win, ret); @@ -925,7 +1146,10 @@ static void msc_buffer_win_free(struct msc *msc, struct msc_window *win) msc_buffer_set_wb(win); - __msc_buffer_win_free(msc, win); + if (msc->mbuf && msc->mbuf->free_window) + msc->mbuf->free_window(msc->mbuf_priv, win->sgt); + else + __msc_buffer_win_free(msc, win); kfree(win); } @@ -1462,18 +1686,77 @@ static void msc_win_switch(struct msc *msc) intel_th_trace_switch(msc->thdev); } +/** + * intel_th_msc_window_unlock - put the window back in rotation + * @dev: MSC device to which this relates + * @sgt: buffer's sg_table for the window, does nothing if NULL + */ +void intel_th_msc_window_unlock(struct device *dev, struct sg_table *sgt) +{ + struct msc *msc = dev_get_drvdata(dev); + struct msc_window *win; + + if (!sgt) + return; + + win = msc_find_window(msc, sgt, false); + if (!win) + return; + + msc_win_set_lockout(win, WIN_LOCKED, WIN_READY); +} +EXPORT_SYMBOL_GPL(intel_th_msc_window_unlock); + +static void msc_work(struct work_struct *work) +{ + struct msc *msc = container_of(work, struct msc, work); + + intel_th_msc_deactivate(msc->thdev); +} + static irqreturn_t intel_th_msc_interrupt(struct intel_th_device *thdev) { struct msc *msc = dev_get_drvdata(&thdev->dev); u32 msusts = ioread32(msc->msu_base + REG_MSU_MSUSTS); u32 mask = msc->index ? MSUSTS_MSC1BLAST : MSUSTS_MSC0BLAST; + struct msc_window *win, *next_win; - if (!(msusts & mask)) { - if (msc->enabled) - return IRQ_HANDLED; + if (!msc->do_irq || !msc->mbuf) return IRQ_NONE; + + msusts &= mask; + + if (!msusts) + return msc->enabled ? IRQ_HANDLED : IRQ_NONE; + + iowrite32(msusts, msc->msu_base + REG_MSU_MSUSTS); + + if (!msc->enabled) + return IRQ_NONE; + + /* grab the window before we do the switch */ + win = msc->cur_win; + if (!win) + return IRQ_HANDLED; + next_win = msc_next_window(win); + if (!next_win) + return IRQ_HANDLED; + + /* next window: if READY, proceed, if LOCKED, stop the trace */ + if (msc_win_set_lockout(next_win, WIN_READY, WIN_INUSE)) { + schedule_work(&msc->work); + return IRQ_HANDLED; } + /* current window: INUSE -> LOCKED */ + msc_win_set_lockout(win, WIN_INUSE, WIN_LOCKED); + + msc_win_switch(msc); + + if (msc->mbuf && msc->mbuf->ready) + msc->mbuf->ready(msc->mbuf_priv, win->sgt, + msc_win_total_sz(win)); + return IRQ_HANDLED; } @@ -1511,21 +1794,43 @@ wrap_store(struct device *dev, struct device_attribute *attr, const char *buf, static DEVICE_ATTR_RW(wrap); +static void msc_buffer_unassign(struct msc *msc) +{ + lockdep_assert_held(&msc->buf_mutex); + + if (!msc->mbuf) + return; + + msc->mbuf->unassign(msc->mbuf_priv); + msu_buffer_put(msc->mbuf); + msc->mbuf_priv = NULL; + msc->mbuf = NULL; +} + static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct msc *msc = dev_get_drvdata(dev); + const char *mode = msc_mode[msc->mode]; + ssize_t ret; + + mutex_lock(&msc->buf_mutex); + if (msc->mbuf) + mode = msc->mbuf->name; + ret = scnprintf(buf, PAGE_SIZE, "%s\n", mode); + mutex_unlock(&msc->buf_mutex); - return scnprintf(buf, PAGE_SIZE, "%s\n", msc_mode[msc->mode]); + return ret; } static ssize_t mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { + const struct msu_buffer *mbuf = NULL; struct msc *msc = dev_get_drvdata(dev); size_t len = size; - char *cp; + char *cp, *mode; int i, ret; if (!capable(CAP_SYS_RAWIO)) @@ -1535,17 +1840,59 @@ mode_store(struct device *dev, struct device_attribute *attr, const char *buf, if (cp) len = cp - buf; - for (i = 0; i < ARRAY_SIZE(msc_mode); i++) - if (!strncmp(msc_mode[i], buf, len)) - goto found; + mode = kstrndup(buf, len, GFP_KERNEL); + i = match_string(msc_mode, ARRAY_SIZE(msc_mode), mode); + if (i >= 0) + goto found; + + /* Buffer sinks only work with a usable IRQ */ + if (!msc->do_irq) { + kfree(mode); + return -EINVAL; + } + + mbuf = msu_buffer_get(mode); + kfree(mode); + if (mbuf) + goto found; return -EINVAL; found: mutex_lock(&msc->buf_mutex); + ret = 0; + + /* Same buffer: do nothing */ + if (mbuf && mbuf == msc->mbuf) { + /* put the extra reference we just got */ + msu_buffer_put(mbuf); + goto unlock; + } + ret = msc_buffer_unlocked_free_unless_used(msc); - if (!ret) - msc->mode = i; + if (ret) + goto unlock; + + if (mbuf) { + void *mbuf_priv = mbuf->assign(dev, &i); + + if (!mbuf_priv) { + ret = -ENOMEM; + goto unlock; + } + + msc_buffer_unassign(msc); + msc->mbuf_priv = mbuf_priv; + msc->mbuf = mbuf; + } else { + msc_buffer_unassign(msc); + } + + msc->mode = i; + +unlock: + if (ret && mbuf) + msu_buffer_put(mbuf); mutex_unlock(&msc->buf_mutex); return ret ? ret : size; @@ -1667,7 +2014,12 @@ win_switch_store(struct device *dev, struct device_attribute *attr, return -EINVAL; mutex_lock(&msc->buf_mutex); - if (msc->mode != MSC_MODE_MULTI) + /* + * Window switch can only happen in the "multi" mode. + * If a external buffer is engaged, they have the full + * control over window switching. + */ + if (msc->mode != MSC_MODE_MULTI || msc->mbuf) ret = -ENOTSUPP; else msc_win_switch(msc); @@ -1720,10 +2072,7 @@ static int intel_th_msc_probe(struct intel_th_device *thdev) msc->reg_base = base + msc->index * 0x100; msc->msu_base = base; - err = intel_th_msu_init(msc); - if (err) - return err; - + INIT_WORK(&msc->work, msc_work); err = intel_th_msc_init(msc); if (err) return err; @@ -1739,7 +2088,6 @@ static void intel_th_msc_remove(struct intel_th_device *thdev) int ret; intel_th_msc_deactivate(thdev); - intel_th_msu_deinit(msc); /* * Buffers should not be used at this point except if the diff --git a/drivers/hwtracing/intel_th/msu.h b/drivers/hwtracing/intel_th/msu.h index 574c16004cb2..3f527dd4d727 100644 --- a/drivers/hwtracing/intel_th/msu.h +++ b/drivers/hwtracing/intel_th/msu.h @@ -44,14 +44,6 @@ enum { #define M0BLIE BIT(16) #define M1BLIE BIT(24) -/* MSC operating modes (MSC_MODE) */ -enum { - MSC_MODE_SINGLE = 0, - MSC_MODE_MULTI, - MSC_MODE_EXI, - MSC_MODE_DEBUG, -}; - /* MSCnSTS bits */ #define MSCSTS_WRAPSTAT BIT(1) /* Wrap occurred */ #define MSCSTS_PLE BIT(2) /* Pipeline Empty */ @@ -93,6 +85,16 @@ static inline unsigned long msc_data_sz(struct msc_block_desc *bdesc) return bdesc->valid_dw * 4 - MSC_BDESC; } +static inline unsigned long msc_total_sz(struct msc_block_desc *bdesc) +{ + return bdesc->valid_dw * 4; +} + +static inline unsigned long msc_block_sz(struct msc_block_desc *bdesc) +{ + return bdesc->block_sz * 64 - MSC_BDESC; +} + static inline bool msc_block_wrapped(struct msc_block_desc *bdesc) { if (bdesc->hw_tag & (MSC_HW_TAG_BLOCKWRAP | MSC_HW_TAG_WINWRAP)) @@ -104,7 +106,7 @@ static inline bool msc_block_wrapped(struct msc_block_desc *bdesc) static inline bool msc_block_last_written(struct msc_block_desc *bdesc) { if ((bdesc->hw_tag & MSC_HW_TAG_ENDBIT) || - (msc_data_sz(bdesc) != DATA_IN_PAGE)) + (msc_data_sz(bdesc) != msc_block_sz(bdesc))) return true; return false; diff --git a/include/linux/intel_th.h b/include/linux/intel_th.h new file mode 100644 index 000000000000..9b7f4c22499c --- /dev/null +++ b/include/linux/intel_th.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Intel(R) Trace Hub data structures for implementing buffer sinks. + * + * Copyright (C) 2019 Intel Corporation. + */ + +#ifndef _INTEL_TH_H_ +#define _INTEL_TH_H_ + +#include + +/* MSC operating modes (MSC_MODE) */ +enum { + MSC_MODE_SINGLE = 0, + MSC_MODE_MULTI, + MSC_MODE_EXI, + MSC_MODE_DEBUG, +}; + +struct msu_buffer { + const char *name; + /* + * ->assign() called when buffer 'mode' is set to this driver + * (aka mode_store()) + * @device: struct device * of the msc + * @mode: allows the driver to set HW mode (see the enum above) + * Returns: a pointer to a private structure associated with this + * msc or NULL in case of error. This private structure + * will then be passed into all other callbacks. + */ + void *(*assign)(struct device *dev, int *mode); + /* ->unassign(): some other mode is selected, clean up */ + void (*unassign)(void *priv); + /* + * ->alloc_window(): allocate memory for the window of a given + * size + * @sgt: pointer to sg_table, can be overridden by the buffer + * driver, or kept intact + * Returns: number of sg table entries <= number of pages; + * 0 is treated as an allocation failure. + */ + int (*alloc_window)(void *priv, struct sg_table **sgt, + size_t size); + void (*free_window)(void *priv, struct sg_table *sgt); + /* ->activate(): trace has started */ + void (*activate)(void *priv); + /* ->deactivate(): trace is about to stop */ + void (*deactivate)(void *priv); + /* + * ->ready(): window @sgt is filled up to the last block OR + * tracing is stopped by the user; this window contains + * @bytes data. The window in question transitions into + * the "LOCKED" state, indicating that it can't be used + * by hardware. To clear this state and make the window + * available to the hardware again, call + * intel_th_msc_window_unlock(). + */ + int (*ready)(void *priv, struct sg_table *sgt, size_t bytes); +}; + +int intel_th_msu_buffer_register(const struct msu_buffer *mbuf, + struct module *owner); +void intel_th_msu_buffer_unregister(const struct msu_buffer *mbuf); +void intel_th_msc_window_unlock(struct device *dev, struct sg_table *sgt); + +#define module_intel_th_msu_buffer(__buffer) \ +static int __init __buffer##_init(void) \ +{ \ + return intel_th_msu_buffer_register(&(__buffer), THIS_MODULE); \ +} \ +module_init(__buffer##_init); \ +static void __exit __buffer##_exit(void) \ +{ \ + intel_th_msu_buffer_unregister(&(__buffer)); \ +} \ +module_exit(__buffer##_exit); + +#endif /* _INTEL_TH_H_ */ -- cgit v1.2.3 From 515db266a9dace92b0cbaed9a6044dd5304b8ca9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Jul 2019 17:21:06 +0200 Subject: driver core: Remove device link creation limitation If device_link_add() is called for a consumer/supplier pair with an existing device link between them and the existing link's type is not in agreement with the flags passed to that function by its caller, NULL will be returned. That is seriously inconvenient, because it forces the callers of device_link_add() to worry about what others may or may not do even if that is not relevant to them for any other reasons. It turns out, however, that this limitation can be made go away relatively easily. The underlying observation is that if DL_FLAG_STATELESS has been passed to device_link_add() in flags for the given consumer/supplier pair at least once, calling either device_link_del() or device_link_remove() to release the link returned by it should work, but there are no other requirements associated with that flag. In turn, if at least one of the callers of device_link_add() for the given consumer/supplier pair has not passed DL_FLAG_STATELESS to it in flags, the driver core should track the status of the link and act on it as appropriate (ie. the link should be treated as "managed"). This means that DL_FLAG_STATELESS needs to be set for managed device links and it should be valid to call device_link_del() or device_link_remove() to drop references to them in certain sutiations. To allow that to happen, introduce a new (internal) device link flag called DL_FLAG_MANAGED and make device_link_add() set it automatically whenever DL_FLAG_STATELESS is not passed to it. Also make it take additional references to existing device links that were previously stateless (that is, with DL_FLAG_STATELESS set and DL_FLAG_MANAGED unset) and will need to be managed going forward and initialize their status (which has been DL_STATE_NONE so far). Accordingly, when a managed device link is dropped automatically by the driver core, make it clear DL_FLAG_MANAGED, reset the link's status back to DL_STATE_NONE and drop the reference to it associated with DL_FLAG_MANAGED instead of just deleting it right away (to allow it to stay around in case it still needs to be released explicitly by someone). With that, since setting DL_FLAG_STATELESS doesn't mean that the device link in question is not managed any more, replace all of the status-tracking checks against DL_FLAG_STATELESS with analogous checks against DL_FLAG_MANAGED and update the documentation to reflect these changes. While at it, make device_link_add() reject flags that it does not recognize, including DL_FLAG_MANAGED. Signed-off-by: Rafael J. Wysocki Reviewed-by: Saravana Kannan Tested-by: Marek Szyprowski Review-by: Saravana Kannan Link: https://lore.kernel.org/r/2305283.AStDPdUUnE@kreacher Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/device_link.rst | 4 +- drivers/base/core.c | 176 +++++++++++++++++-------------- drivers/base/power/runtime.c | 4 +- include/linux/device.h | 4 +- 4 files changed, 106 insertions(+), 82 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/device_link.rst b/Documentation/driver-api/device_link.rst index ae1e3d0394b0..1b5020ec6517 100644 --- a/Documentation/driver-api/device_link.rst +++ b/Documentation/driver-api/device_link.rst @@ -78,8 +78,8 @@ typically deleted in its ``->remove`` callback for symmetry. That way, if the driver is compiled as a module, the device link is added on module load and orderly deleted on unload. The same restrictions that apply to device link addition (e.g. exclusion of a parallel suspend/resume transition) apply equally -to deletion. Device links with ``DL_FLAG_STATELESS`` unset (i.e. managed -device links) are deleted automatically by the driver core. +to deletion. Device links managed by the driver core are deleted automatically +by it. Several flags may be specified on device link addition, two of which have already been mentioned above: ``DL_FLAG_STATELESS`` to express that no diff --git a/drivers/base/core.c b/drivers/base/core.c index da84a73f2ba6..21cd08162219 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -124,6 +124,50 @@ static int device_is_dependent(struct device *dev, void *target) return ret; } +static void device_link_init_status(struct device_link *link, + struct device *consumer, + struct device *supplier) +{ + switch (supplier->links.status) { + case DL_DEV_PROBING: + switch (consumer->links.status) { + case DL_DEV_PROBING: + /* + * A consumer driver can create a link to a supplier + * that has not completed its probing yet as long as it + * knows that the supplier is already functional (for + * example, it has just acquired some resources from the + * supplier). + */ + link->status = DL_STATE_CONSUMER_PROBE; + break; + default: + link->status = DL_STATE_DORMANT; + break; + } + break; + case DL_DEV_DRIVER_BOUND: + switch (consumer->links.status) { + case DL_DEV_PROBING: + link->status = DL_STATE_CONSUMER_PROBE; + break; + case DL_DEV_DRIVER_BOUND: + link->status = DL_STATE_ACTIVE; + break; + default: + link->status = DL_STATE_AVAILABLE; + break; + } + break; + case DL_DEV_UNBINDING: + link->status = DL_STATE_SUPPLIER_UNBIND; + break; + default: + link->status = DL_STATE_DORMANT; + break; + } +} + static int device_reorder_to_tail(struct device *dev, void *not_used) { struct device_link *link; @@ -165,6 +209,10 @@ void device_pm_move_to_tail(struct device *dev) device_links_read_unlock(idx); } +#define DL_MANAGED_LINK_FLAGS (DL_FLAG_AUTOREMOVE_CONSUMER | \ + DL_FLAG_AUTOREMOVE_SUPPLIER | \ + DL_FLAG_AUTOPROBE_CONSUMER) + /** * device_link_add - Create a link between two devices. * @consumer: Consumer end of the link. @@ -179,9 +227,9 @@ void device_pm_move_to_tail(struct device *dev) * of the link. If DL_FLAG_PM_RUNTIME is not set, DL_FLAG_RPM_ACTIVE will be * ignored. * - * If DL_FLAG_STATELESS is set in @flags, the link is not going to be managed by - * the driver core and, in particular, the caller of this function is expected - * to drop the reference to the link acquired by it directly. + * If DL_FLAG_STATELESS is set in @flags, the caller of this function is + * expected to release the link returned by it directly with the help of either + * device_link_del() or device_link_remove(). * * If that flag is not set, however, the caller of this function is handing the * management of the link over to the driver core entirely and its return value @@ -201,9 +249,16 @@ void device_pm_move_to_tail(struct device *dev) * be used to request the driver core to automaticall probe for a consmer * driver after successfully binding a driver to the supplier device. * - * The combination of DL_FLAG_STATELESS and either DL_FLAG_AUTOREMOVE_CONSUMER - * or DL_FLAG_AUTOREMOVE_SUPPLIER set in @flags at the same time is invalid and - * will cause NULL to be returned upfront. + * The combination of DL_FLAG_STATELESS and one of DL_FLAG_AUTOREMOVE_CONSUMER, + * DL_FLAG_AUTOREMOVE_SUPPLIER, or DL_FLAG_AUTOPROBE_CONSUMER set in @flags at + * the same time is invalid and will cause NULL to be returned upfront. + * However, if a device link between the given @consumer and @supplier pair + * exists already when this function is called for them, the existing link will + * be returned regardless of its current type and status (the link's flags may + * be modified then). The caller of this function is then expected to treat + * the link as though it has just been created, so (in particular) if + * DL_FLAG_STATELESS was passed in @flags, the link needs to be released + * explicitly when not needed any more (as stated above). * * A side effect of the link creation is re-ordering of dpm_list and the * devices_kset list by moving the consumer device and all devices depending @@ -220,10 +275,8 @@ struct device_link *device_link_add(struct device *consumer, struct device_link *link; if (!consumer || !supplier || - (flags & DL_FLAG_STATELESS && - flags & (DL_FLAG_AUTOREMOVE_CONSUMER | - DL_FLAG_AUTOREMOVE_SUPPLIER | - DL_FLAG_AUTOPROBE_CONSUMER)) || + (flags & ~(DL_FLAG_STATELESS | DL_MANAGED_LINK_FLAGS)) || + (flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) || (flags & DL_FLAG_AUTOPROBE_CONSUMER && flags & (DL_FLAG_AUTOREMOVE_CONSUMER | DL_FLAG_AUTOREMOVE_SUPPLIER))) @@ -236,6 +289,9 @@ struct device_link *device_link_add(struct device *consumer, } } + if (!(flags & DL_FLAG_STATELESS)) + flags |= DL_FLAG_MANAGED; + device_links_write_lock(); device_pm_lock(); @@ -262,15 +318,6 @@ struct device_link *device_link_add(struct device *consumer, if (link->consumer != consumer) continue; - /* - * Don't return a stateless link if the caller wants a stateful - * one and vice versa. - */ - if (WARN_ON((flags & DL_FLAG_STATELESS) != (link->flags & DL_FLAG_STATELESS))) { - link = NULL; - goto out; - } - if (flags & DL_FLAG_PM_RUNTIME) { if (!(link->flags & DL_FLAG_PM_RUNTIME)) { pm_runtime_new_link(consumer); @@ -281,6 +328,7 @@ struct device_link *device_link_add(struct device *consumer, } if (flags & DL_FLAG_STATELESS) { + link->flags |= DL_FLAG_STATELESS; kref_get(&link->kref); goto out; } @@ -299,6 +347,11 @@ struct device_link *device_link_add(struct device *consumer, link->flags &= ~(DL_FLAG_AUTOREMOVE_CONSUMER | DL_FLAG_AUTOREMOVE_SUPPLIER); } + if (!(link->flags & DL_FLAG_MANAGED)) { + kref_get(&link->kref); + link->flags |= DL_FLAG_MANAGED; + device_link_init_status(link, consumer, supplier); + } goto out; } @@ -325,48 +378,10 @@ struct device_link *device_link_add(struct device *consumer, kref_init(&link->kref); /* Determine the initial link state. */ - if (flags & DL_FLAG_STATELESS) { + if (flags & DL_FLAG_STATELESS) link->status = DL_STATE_NONE; - } else { - switch (supplier->links.status) { - case DL_DEV_PROBING: - switch (consumer->links.status) { - case DL_DEV_PROBING: - /* - * A consumer driver can create a link to a - * supplier that has not completed its probing - * yet as long as it knows that the supplier is - * already functional (for example, it has just - * acquired some resources from the supplier). - */ - link->status = DL_STATE_CONSUMER_PROBE; - break; - default: - link->status = DL_STATE_DORMANT; - break; - } - break; - case DL_DEV_DRIVER_BOUND: - switch (consumer->links.status) { - case DL_DEV_PROBING: - link->status = DL_STATE_CONSUMER_PROBE; - break; - case DL_DEV_DRIVER_BOUND: - link->status = DL_STATE_ACTIVE; - break; - default: - link->status = DL_STATE_AVAILABLE; - break; - } - break; - case DL_DEV_UNBINDING: - link->status = DL_STATE_SUPPLIER_UNBIND; - break; - default: - link->status = DL_STATE_DORMANT; - break; - } - } + else + device_link_init_status(link, consumer, supplier); /* * Some callers expect the link creation during consumer driver probe to @@ -528,7 +543,7 @@ static void device_links_missing_supplier(struct device *dev) * mark the link as "consumer probe in progress" to make the supplier removal * wait for us to complete (or bad things may happen). * - * Links with the DL_FLAG_STATELESS flag set are ignored. + * Links without the DL_FLAG_MANAGED flag set are ignored. */ int device_links_check_suppliers(struct device *dev) { @@ -538,7 +553,7 @@ int device_links_check_suppliers(struct device *dev) device_links_write_lock(); list_for_each_entry(link, &dev->links.suppliers, c_node) { - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; if (link->status != DL_STATE_AVAILABLE) { @@ -563,7 +578,7 @@ int device_links_check_suppliers(struct device *dev) * * Also change the status of @dev's links to suppliers to "active". * - * Links with the DL_FLAG_STATELESS flag set are ignored. + * Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_driver_bound(struct device *dev) { @@ -572,7 +587,7 @@ void device_links_driver_bound(struct device *dev) device_links_write_lock(); list_for_each_entry(link, &dev->links.consumers, s_node) { - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; /* @@ -593,7 +608,7 @@ void device_links_driver_bound(struct device *dev) } list_for_each_entry(link, &dev->links.suppliers, c_node) { - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; WARN_ON(link->status != DL_STATE_CONSUMER_PROBE); @@ -605,6 +620,13 @@ void device_links_driver_bound(struct device *dev) device_links_write_unlock(); } +static void device_link_drop_managed(struct device_link *link) +{ + link->flags &= ~DL_FLAG_MANAGED; + WRITE_ONCE(link->status, DL_STATE_NONE); + kref_put(&link->kref, __device_link_del); +} + /** * __device_links_no_driver - Update links of a device without a driver. * @dev: Device without a drvier. @@ -615,18 +637,18 @@ void device_links_driver_bound(struct device *dev) * unless they already are in the "supplier unbind in progress" state in which * case they need not be updated. * - * Links with the DL_FLAG_STATELESS flag set are ignored. + * Links without the DL_FLAG_MANAGED flag set are ignored. */ static void __device_links_no_driver(struct device *dev) { struct device_link *link, *ln; list_for_each_entry_safe_reverse(link, ln, &dev->links.suppliers, c_node) { - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) - __device_link_del(&link->kref); + device_link_drop_managed(link); else if (link->status == DL_STATE_CONSUMER_PROBE || link->status == DL_STATE_ACTIVE) WRITE_ONCE(link->status, DL_STATE_AVAILABLE); @@ -643,7 +665,7 @@ static void __device_links_no_driver(struct device *dev) * %__device_links_no_driver() to update links to suppliers for it as * appropriate. * - * Links with the DL_FLAG_STATELESS flag set are ignored. + * Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_no_driver(struct device *dev) { @@ -652,7 +674,7 @@ void device_links_no_driver(struct device *dev) device_links_write_lock(); list_for_each_entry(link, &dev->links.consumers, s_node) { - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; /* @@ -680,7 +702,7 @@ void device_links_no_driver(struct device *dev) * invoke %__device_links_no_driver() to update links to suppliers for it as * appropriate. * - * Links with the DL_FLAG_STATELESS flag set are ignored. + * Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_driver_cleanup(struct device *dev) { @@ -689,7 +711,7 @@ void device_links_driver_cleanup(struct device *dev) device_links_write_lock(); list_for_each_entry_safe(link, ln, &dev->links.consumers, s_node) { - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER); @@ -702,7 +724,7 @@ void device_links_driver_cleanup(struct device *dev) */ if (link->status == DL_STATE_SUPPLIER_UNBIND && link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER) - __device_link_del(&link->kref); + device_link_drop_managed(link); WRITE_ONCE(link->status, DL_STATE_DORMANT); } @@ -724,7 +746,7 @@ void device_links_driver_cleanup(struct device *dev) * * Return 'false' if there are no probing or active consumers. * - * Links with the DL_FLAG_STATELESS flag set are ignored. + * Links without the DL_FLAG_MANAGED flag set are ignored. */ bool device_links_busy(struct device *dev) { @@ -734,7 +756,7 @@ bool device_links_busy(struct device *dev) device_links_write_lock(); list_for_each_entry(link, &dev->links.consumers, s_node) { - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; if (link->status == DL_STATE_CONSUMER_PROBE @@ -764,7 +786,7 @@ bool device_links_busy(struct device *dev) * driver to unbind and start over (the consumer will not re-probe as we have * changed the state of the link already). * - * Links with the DL_FLAG_STATELESS flag set are ignored. + * Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_unbind_consumers(struct device *dev) { @@ -776,7 +798,7 @@ void device_links_unbind_consumers(struct device *dev) list_for_each_entry(link, &dev->links.consumers, s_node) { enum device_link_state status; - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; status = link->status; diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index b75335508d2c..45a8fbe6987a 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1624,7 +1624,7 @@ void pm_runtime_remove(struct device *dev) * runtime PM references to the device, drop the usage counter of the device * (as many times as needed). * - * Links with the DL_FLAG_STATELESS flag set are ignored. + * Links with the DL_FLAG_MANAGED flag unset are ignored. * * Since the device is guaranteed to be runtime-active at the point this is * called, nothing else needs to be done here. @@ -1641,7 +1641,7 @@ void pm_runtime_clean_up_links(struct device *dev) idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.consumers, s_node) { - if (link->flags & DL_FLAG_STATELESS) + if (!(link->flags & DL_FLAG_MANAGED)) continue; while (refcount_dec_not_one(&link->rpm_active)) diff --git a/include/linux/device.h b/include/linux/device.h index c330b75c6c57..c1c489921e4c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -833,12 +833,13 @@ enum device_link_state { /* * Device link flags. * - * STATELESS: The core won't track the presence of supplier/consumer drivers. + * STATELESS: The core will not remove this link automatically. * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind. * PM_RUNTIME: If set, the runtime PM framework will use this link. * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation. * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind. * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds. + * MANAGED: The core tracks presence of supplier/consumer drivers (internal). */ #define DL_FLAG_STATELESS BIT(0) #define DL_FLAG_AUTOREMOVE_CONSUMER BIT(1) @@ -846,6 +847,7 @@ enum device_link_state { #define DL_FLAG_RPM_ACTIVE BIT(3) #define DL_FLAG_AUTOREMOVE_SUPPLIER BIT(4) #define DL_FLAG_AUTOPROBE_CONSUMER BIT(5) +#define DL_FLAG_MANAGED BIT(6) /** * struct device_link - Device link representation. -- cgit v1.2.3 From 016413d967061fc2eb6798a487b3022bef7698a6 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 4 Apr 2019 19:43:29 -0400 Subject: media: v4l2-async: Get fwnode reference when putting it to the notifier's list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v4l2_async_notifier_add_fwnode_subdev() did not take a reference of the added fwnode, relying on the caller to handle that instead, in essence putting the fwnode to be added if there was an error. As the reference is eventually released during the notifier cleanup, this is not intuitive nor logical. Improve this by always getting a reference when the function succeeds, and the caller releasing the reference when it does not *itself* need it anymore. Luckily, perhaps, there were just a handful of callers using the function. Signed-off-by: Sakari Ailus Reviewed-by: Jacopo Mondi Tested-by: Niklas Söderlund Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/am437x/am437x-vpfe.c | 5 ++--- drivers/media/platform/davinci/vpif_capture.c | 13 ++++++------- drivers/media/platform/qcom/camss/camss.c | 2 +- drivers/media/platform/xilinx/xilinx-vipp.c | 2 +- drivers/media/v4l2-core/v4l2-async.c | 3 ++- drivers/media/v4l2-core/v4l2-fwnode.c | 23 ++++++----------------- include/media/v4l2-async.h | 5 +++-- 7 files changed, 21 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/media/platform/am437x/am437x-vpfe.c b/drivers/media/platform/am437x/am437x-vpfe.c index 105237edbb58..3b1d60ca859b 100644 --- a/drivers/media/platform/am437x/am437x-vpfe.c +++ b/drivers/media/platform/am437x/am437x-vpfe.c @@ -2489,10 +2489,9 @@ vpfe_get_pdata(struct vpfe_device *vpfe) pdata->asd[i] = v4l2_async_notifier_add_fwnode_subdev( &vpfe->notifier, of_fwnode_handle(rem), sizeof(struct v4l2_async_subdev)); - if (IS_ERR(pdata->asd[i])) { - of_node_put(rem); + of_node_put(rem); + if (IS_ERR(pdata->asd[i])) goto cleanup; - } } of_node_put(endpoint); diff --git a/drivers/media/platform/davinci/vpif_capture.c b/drivers/media/platform/davinci/vpif_capture.c index 4bbb4fdeae92..71f4fe882d13 100644 --- a/drivers/media/platform/davinci/vpif_capture.c +++ b/drivers/media/platform/davinci/vpif_capture.c @@ -1502,6 +1502,7 @@ static struct vpif_capture_config * vpif_capture_get_pdata(struct platform_device *pdev) { struct device_node *endpoint = NULL; + struct device_node *rem = NULL; struct vpif_capture_config *pdata; struct vpif_subdev_info *sdinfo; struct vpif_capture_chan_config *chan; @@ -1532,7 +1533,6 @@ vpif_capture_get_pdata(struct platform_device *pdev) for (i = 0; i < VPIF_CAPTURE_NUM_CHANNELS; i++) { struct v4l2_fwnode_endpoint bus_cfg = { .bus_type = 0 }; - struct device_node *rem; unsigned int flags; int err; @@ -1554,10 +1554,8 @@ vpif_capture_get_pdata(struct platform_device *pdev) VPIF_CAPTURE_NUM_CHANNELS, sizeof(*chan->inputs), GFP_KERNEL); - if (!chan->inputs) { - of_node_put(rem); + if (!chan->inputs) goto err_cleanup; - } chan->input_count++; chan->inputs[i].input.type = V4L2_INPUT_TYPE_CAMERA; @@ -1589,10 +1587,10 @@ vpif_capture_get_pdata(struct platform_device *pdev) pdata->asd[i] = v4l2_async_notifier_add_fwnode_subdev( &vpif_obj.notifier, of_fwnode_handle(rem), sizeof(struct v4l2_async_subdev)); - if (IS_ERR(pdata->asd[i])) { - of_node_put(rem); + if (IS_ERR(pdata->asd[i])) goto err_cleanup; - } + + of_node_put(rem); } done: @@ -1604,6 +1602,7 @@ done: return pdata; err_cleanup: + of_node_put(rem); of_node_put(endpoint); v4l2_async_notifier_cleanup(&vpif_obj.notifier); diff --git a/drivers/media/platform/qcom/camss/camss.c b/drivers/media/platform/qcom/camss/camss.c index 63da18773d24..3fdc9f964a3c 100644 --- a/drivers/media/platform/qcom/camss/camss.c +++ b/drivers/media/platform/qcom/camss/camss.c @@ -486,9 +486,9 @@ static int camss_of_parse_ports(struct camss *camss) asd = v4l2_async_notifier_add_fwnode_subdev( &camss->notifier, of_fwnode_handle(remote), sizeof(*csd)); + of_node_put(remote); if (IS_ERR(asd)) { ret = PTR_ERR(asd); - of_node_put(remote); goto err_cleanup; } diff --git a/drivers/media/platform/xilinx/xilinx-vipp.c b/drivers/media/platform/xilinx/xilinx-vipp.c index edce0402155d..cc2856efea59 100644 --- a/drivers/media/platform/xilinx/xilinx-vipp.c +++ b/drivers/media/platform/xilinx/xilinx-vipp.c @@ -385,9 +385,9 @@ static int xvip_graph_parse_one(struct xvip_composite_device *xdev, asd = v4l2_async_notifier_add_fwnode_subdev( &xdev->notifier, remote, sizeof(struct xvip_graph_entity)); + fwnode_handle_put(remote); if (IS_ERR(asd)) { ret = PTR_ERR(asd); - fwnode_handle_put(remote); goto err_notifier_cleanup; } } diff --git a/drivers/media/v4l2-core/v4l2-async.c b/drivers/media/v4l2-core/v4l2-async.c index 8d307b538f52..7d364c545a40 100644 --- a/drivers/media/v4l2-core/v4l2-async.c +++ b/drivers/media/v4l2-core/v4l2-async.c @@ -593,10 +593,11 @@ v4l2_async_notifier_add_fwnode_subdev(struct v4l2_async_notifier *notifier, return ERR_PTR(-ENOMEM); asd->match_type = V4L2_ASYNC_MATCH_FWNODE; - asd->match.fwnode = fwnode; + asd->match.fwnode = fwnode_handle_get(fwnode); ret = v4l2_async_notifier_add_subdev(notifier, asd); if (ret) { + fwnode_handle_put(fwnode); kfree(asd); return ERR_PTR(ret); } diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c index 6972c5af25c6..3bd1888787eb 100644 --- a/drivers/media/v4l2-core/v4l2-fwnode.c +++ b/drivers/media/v4l2-core/v4l2-fwnode.c @@ -775,23 +775,17 @@ static int v4l2_fwnode_reference_parse(struct device *dev, asd = v4l2_async_notifier_add_fwnode_subdev(notifier, args.fwnode, sizeof(*asd)); + fwnode_handle_put(args.fwnode); if (IS_ERR(asd)) { - ret = PTR_ERR(asd); /* not an error if asd already exists */ - if (ret == -EEXIST) { - fwnode_handle_put(args.fwnode); + if (PTR_ERR(asd) == -EEXIST) continue; - } - goto error; + return PTR_ERR(asd); } } return 0; - -error: - fwnode_handle_put(args.fwnode); - return ret; } /* @@ -1081,23 +1075,18 @@ v4l2_fwnode_reference_parse_int_props(struct device *dev, asd = v4l2_async_notifier_add_fwnode_subdev(notifier, fwnode, sizeof(*asd)); + fwnode_handle_put(fwnode); if (IS_ERR(asd)) { ret = PTR_ERR(asd); /* not an error if asd already exists */ - if (ret == -EEXIST) { - fwnode_handle_put(fwnode); + if (ret == -EEXIST) continue; - } - goto error; + return PTR_ERR(asd); } } return !fwnode || PTR_ERR(fwnode) == -ENOENT ? 0 : PTR_ERR(fwnode); - -error: - fwnode_handle_put(fwnode); - return ret; } int v4l2_async_notifier_parse_fwnode_sensor_common(struct device *dev, diff --git a/include/media/v4l2-async.h b/include/media/v4l2-async.h index 2e3d93f742a3..a95f7fd8969e 100644 --- a/include/media/v4l2-async.h +++ b/include/media/v4l2-async.h @@ -172,8 +172,9 @@ int v4l2_async_notifier_add_subdev(struct v4l2_async_notifier *notifier, * the driver's async sub-device struct, i.e. both * begin at the same memory address. * - * Allocate a fwnode-matched asd of size asd_struct_size, and add it - * to the notifiers @asd_list. + * Allocate a fwnode-matched asd of size asd_struct_size, and add it to the + * notifiers @asd_list. The function also gets a reference of the fwnode which + * is released later at notifier cleanup time. */ struct v4l2_async_subdev * v4l2_async_notifier_add_fwnode_subdev(struct v4l2_async_notifier *notifier, -- cgit v1.2.3 From 820342aca05188c9af4b468d6c41b3327161f7ad Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 28 Feb 2019 08:25:28 -0500 Subject: media: v4l2-async: Add v4l2_async_notifier_add_fwnode_remote_subdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v4l2_async_notifier_add_fwnode_remote_subdev is a convenience function for parsing information on V4L2 fwnode subdevs. Signed-off-by: Sakari Ailus Tested-by: Niklas Söderlund Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-async.c | 23 +++++++++++++++++++++++ include/media/v4l2-async.h | 25 +++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-async.c b/drivers/media/v4l2-core/v4l2-async.c index 7d364c545a40..dc4f470ea6a7 100644 --- a/drivers/media/v4l2-core/v4l2-async.c +++ b/drivers/media/v4l2-core/v4l2-async.c @@ -606,6 +606,29 @@ v4l2_async_notifier_add_fwnode_subdev(struct v4l2_async_notifier *notifier, } EXPORT_SYMBOL_GPL(v4l2_async_notifier_add_fwnode_subdev); +int +v4l2_async_notifier_add_fwnode_remote_subdev(struct v4l2_async_notifier *notif, + struct fwnode_handle *endpoint, + struct v4l2_async_subdev *asd) +{ + struct fwnode_handle *remote; + int ret; + + remote = fwnode_graph_get_remote_port_parent(endpoint); + if (!remote) + return -ENOTCONN; + + asd->match_type = V4L2_ASYNC_MATCH_FWNODE; + asd->match.fwnode = remote; + + ret = v4l2_async_notifier_add_subdev(notif, asd); + if (ret) + fwnode_handle_put(remote); + + return ret; +} +EXPORT_SYMBOL_GPL(v4l2_async_notifier_add_fwnode_remote_subdev); + struct v4l2_async_subdev * v4l2_async_notifier_add_i2c_subdev(struct v4l2_async_notifier *notifier, int adapter_id, unsigned short address, diff --git a/include/media/v4l2-async.h b/include/media/v4l2-async.h index a95f7fd8969e..8319284c93cb 100644 --- a/include/media/v4l2-async.h +++ b/include/media/v4l2-async.h @@ -181,6 +181,31 @@ v4l2_async_notifier_add_fwnode_subdev(struct v4l2_async_notifier *notifier, struct fwnode_handle *fwnode, unsigned int asd_struct_size); +/** + * v4l2_async_notifier_add_fwnode_remote_subdev - Allocate and add a fwnode + * remote async subdev to the + * notifier's master asd_list. + * + * @notif: pointer to &struct v4l2_async_notifier + * @endpoint: local endpoint pointing to the remote sub-device to be matched + * @asd: Async sub-device struct allocated by the caller. The &struct + * v4l2_async_subdev shall be the first member of the driver's async + * sub-device struct, i.e. both begin at the same memory address. + * + * Gets the remote endpoint of a given local endpoint, set it up for fwnode + * matching and adds the async sub-device to the notifier's @asd_list. The + * function also gets a reference of the fwnode which is released later at + * notifier cleanup time. + * + * This is just like @v4l2_async_notifier_add_fwnode_subdev, but with the + * exception that the fwnode refers to a local endpoint, not the remote one, and + * the function relies on the caller to allocate the async sub-device struct. + */ +int +v4l2_async_notifier_add_fwnode_remote_subdev(struct v4l2_async_notifier *notif, + struct fwnode_handle *endpoint, + struct v4l2_async_subdev *asd); + /** * v4l2_async_notifier_add_i2c_subdev - Allocate and add an i2c async * subdev to the notifier's master asd_list. -- cgit v1.2.3 From 16d51a590a8ce3befb1308e0e7ab77f3b661af33 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 16 Jul 2019 17:20:45 +0200 Subject: sched/fair: Don't free p->numa_faults with concurrent readers When going through execve(), zero out the NUMA fault statistics instead of freeing them. During execve, the task is reachable through procfs and the scheduler. A concurrent /proc/*/sched reader can read data from a freed ->numa_faults allocation (confirmed by KASAN) and write it back to userspace. I believe that it would also be possible for a use-after-free read to occur through a race between a NUMA fault and execve(): task_numa_fault() can lead to task_numa_compare(), which invokes task_weight() on the currently running task of a different CPU. Another way to fix this would be to make ->numa_faults RCU-managed or add extra locking, but it seems easier to wipe the NUMA fault statistics on execve. Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Will Deacon Fixes: 82727018b0d3 ("sched/numa: Call task_numa_free() from do_execve()") Link: https://lkml.kernel.org/r/20190716152047.14424-1-jannh@google.com Signed-off-by: Ingo Molnar --- fs/exec.c | 2 +- include/linux/sched/numa_balancing.h | 4 ++-- kernel/fork.c | 2 +- kernel/sched/fair.c | 24 ++++++++++++++++++++---- 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index c71cbfe6826a..f7f6a140856a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1828,7 +1828,7 @@ static int __do_execve_file(int fd, struct filename *filename, membarrier_execve(current); rseq_execve(current); acct_update_integrals(current); - task_numa_free(current); + task_numa_free(current, false); free_bprm(bprm); kfree(pathbuf); if (filename) diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index e7dd04a84ba8..3988762efe15 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -19,7 +19,7 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); -extern void task_numa_free(struct task_struct *p); +extern void task_numa_free(struct task_struct *p, bool final); extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, int src_nid, int dst_cpu); #else @@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(struct task_struct *p) static inline void set_numabalancing_state(bool enabled) { } -static inline void task_numa_free(struct task_struct *p) +static inline void task_numa_free(struct task_struct *p, bool final) { } static inline bool should_numa_migrate_memory(struct task_struct *p, diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..2852d0e76ea3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); cgroup_free(tsk); - task_numa_free(tsk); + task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 036be95a87e9..6adb0e0f5feb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2353,13 +2353,23 @@ no_join: return; } -void task_numa_free(struct task_struct *p) +/* + * Get rid of NUMA staticstics associated with a task (either current or dead). + * If @final is set, the task is dead and has reached refcount zero, so we can + * safely free all relevant data structures. Otherwise, there might be + * concurrent reads from places like load balancing and procfs, and we should + * reset the data back to default state without freeing ->numa_faults. + */ +void task_numa_free(struct task_struct *p, bool final) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults; + unsigned long *numa_faults = p->numa_faults; unsigned long flags; int i; + if (!numa_faults) + return; + if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) @@ -2372,8 +2382,14 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - kfree(numa_faults); + if (final) { + p->numa_faults = NULL; + kfree(numa_faults); + } else { + p->total_numa_faults = 0; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + numa_faults[i] = 0; + } } /* -- cgit v1.2.3 From cb361d8cdef69990f6b4504dc1fd9a594d983c97 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 16 Jul 2019 17:20:47 +0200 Subject: sched/fair: Use RCU accessors consistently for ->numa_group The old code used RCU annotations and accessors inconsistently for ->numa_group, which can lead to use-after-frees and NULL dereferences. Let all accesses to ->numa_group use proper RCU helpers to prevent such issues. Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Will Deacon Fixes: 8c8a743c5087 ("sched/numa: Use {cpu, pid} to create task groups for shared faults") Link: https://lkml.kernel.org/r/20190716152047.14424-3-jannh@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 ++++- kernel/sched/fair.c | 120 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 90 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8dc1811487f5..9f51932bd543 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1092,7 +1092,15 @@ struct task_struct { u64 last_sum_exec_runtime; struct callback_head numa_work; - struct numa_group *numa_group; + /* + * This pointer is only modified for current in syscall and + * pagefault context (and for tasks being destroyed), so it can be read + * from any of the following contexts: + * - RCU read-side critical section + * - current->numa_group from everywhere + * - task's runqueue locked, task not running + */ + struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6adb0e0f5feb..bc9cfeaac8bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1086,6 +1086,21 @@ struct numa_group { unsigned long faults[0]; }; +/* + * For functions that can be called in multiple contexts that permit reading + * ->numa_group (see struct task_struct for locking rules). + */ +static struct numa_group *deref_task_numa_group(struct task_struct *p) +{ + return rcu_dereference_check(p->numa_group, p == current || + (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); +} + +static struct numa_group *deref_curr_numa_group(struct task_struct *p) +{ + return rcu_dereference_protected(p->numa_group, p == current); +} + static inline unsigned long group_faults_priv(struct numa_group *ng); static inline unsigned long group_faults_shared(struct numa_group *ng); @@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p) { unsigned long smin = task_scan_min(p); unsigned long period = smin; + struct numa_group *ng; /* Scale the maximum scan period with the amount of shared memory. */ - if (p->numa_group) { - struct numa_group *ng = p->numa_group; + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p) period *= shared + 1; period /= private + shared + 1; } + rcu_read_unlock(); return max(smin, period); } @@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p) { unsigned long smin = task_scan_min(p); unsigned long smax; + struct numa_group *ng; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); /* Scale the maximum scan period with the amount of shared memory. */ - if (p->numa_group) { - struct numa_group *ng = p->numa_group; + ng = deref_curr_numa_group(p); + if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); unsigned long period = smax; @@ -1186,7 +1205,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; - p->numa_group = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -1233,7 +1252,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) pid_t task_numa_group_id(struct task_struct *p) { - return p->numa_group ? p->numa_group->gid : 0; + struct numa_group *ng; + pid_t gid = 0; + + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + if (ng) + gid = ng->gid; + rcu_read_unlock(); + + return gid; } /* @@ -1258,11 +1286,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) static inline unsigned long group_faults(struct task_struct *p, int nid) { - if (!p->numa_group) + struct numa_group *ng = deref_task_numa_group(p); + + if (!ng) return 0; - return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + - p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; + return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) @@ -1400,12 +1430,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid, static inline unsigned long group_weight(struct task_struct *p, int nid, int dist) { + struct numa_group *ng = deref_task_numa_group(p); unsigned long faults, total_faults; - if (!p->numa_group) + if (!ng) return 0; - total_faults = p->numa_group->total_faults; + total_faults = ng->total_faults; if (!total_faults) return 0; @@ -1419,7 +1450,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { - struct numa_group *ng = p->numa_group; + struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; @@ -1600,13 +1631,14 @@ static bool load_too_imbalanced(long src_load, long dst_load, static void task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { + struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); struct rq *dst_rq = cpu_rq(env->dst_cpu); + long imp = p_ng ? groupimp : taskimp; struct task_struct *cur; long src_load, dst_load; - long load; - long imp = env->p->numa_group ? groupimp : taskimp; - long moveimp = imp; int dist = env->dist; + long moveimp = imp; + long load; if (READ_ONCE(dst_rq->numa_migrate_on)) return; @@ -1645,21 +1677,22 @@ static void task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - if (cur->numa_group == env->p->numa_group) { + cur_ng = rcu_dereference(cur->numa_group); + if (cur_ng == p_ng) { imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. */ - if (cur->numa_group) + if (cur_ng) imp -= imp / 16; } else { /* * Compare the group weights. If a task is all by itself * (not part of a group), use the task weight instead. */ - if (cur->numa_group && env->p->numa_group) + if (cur_ng && p_ng) imp += group_weight(cur, env->src_nid, dist) - group_weight(cur, env->dst_nid, dist); else @@ -1757,11 +1790,12 @@ static int task_numa_migrate(struct task_struct *p) .best_imp = 0, .best_cpu = -1, }; + unsigned long taskweight, groupweight; struct sched_domain *sd; + long taskimp, groupimp; + struct numa_group *ng; struct rq *best_rq; - unsigned long taskweight, groupweight; int nid, ret, dist; - long taskimp, groupimp; /* * Pick the lowest SD_NUMA domain, as that would have the smallest @@ -1807,7 +1841,8 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { + ng = deref_curr_numa_group(p); + if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1840,7 +1875,7 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group) { + if (ng) { if (env.best_cpu == -1) nid = env.src_nid; else @@ -2135,6 +2170,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + struct numa_group *ng; /* * The p->mm->numa_scan_seq field gets updated without @@ -2152,8 +2188,9 @@ static void task_numa_placement(struct task_struct *p) runtime = numa_get_avg_runtime(p, &period); /* If the task is part of a group prevent parallel updates to group stats */ - if (p->numa_group) { - group_lock = &p->numa_group->lock; + ng = deref_curr_numa_group(p); + if (ng) { + group_lock = &ng->lock; spin_lock_irq(group_lock); } @@ -2194,7 +2231,7 @@ static void task_numa_placement(struct task_struct *p) p->numa_faults[cpu_idx] += f_diff; faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; - if (p->numa_group) { + if (ng) { /* * safe because we can only change our own group * @@ -2202,14 +2239,14 @@ static void task_numa_placement(struct task_struct *p) * nid and priv in a specific region because it * is at the beginning of the numa_faults array. */ - p->numa_group->faults[mem_idx] += diff; - p->numa_group->faults_cpu[mem_idx] += f_diff; - p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[mem_idx]; + ng->faults[mem_idx] += diff; + ng->faults_cpu[mem_idx] += f_diff; + ng->total_faults += diff; + group_faults += ng->faults[mem_idx]; } } - if (!p->numa_group) { + if (!ng) { if (faults > max_faults) { max_faults = faults; max_nid = nid; @@ -2220,8 +2257,8 @@ static void task_numa_placement(struct task_struct *p) } } - if (p->numa_group) { - numa_group_count_active_nodes(p->numa_group); + if (ng) { + numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_nid); } @@ -2255,7 +2292,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, int cpu = cpupid_to_cpu(cpupid); int i; - if (unlikely(!p->numa_group)) { + if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + 4*nr_node_ids*sizeof(unsigned long); @@ -2291,7 +2328,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!grp) goto no_join; - my_grp = p->numa_group; + my_grp = deref_curr_numa_group(p); if (grp == my_grp) goto no_join; @@ -2362,7 +2399,8 @@ no_join: */ void task_numa_free(struct task_struct *p, bool final) { - struct numa_group *grp = p->numa_group; + /* safe: p either is current or is being freed by current */ + struct numa_group *grp = rcu_dereference_raw(p->numa_group); unsigned long *numa_faults = p->numa_faults; unsigned long flags; int i; @@ -2442,7 +2480,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - ng = p->numa_group; + ng = deref_curr_numa_group(p); if (!priv && !local && ng && ng->active_nodes > 1 && numa_is_active_node(cpu_node, ng) && numa_is_active_node(mem_node, ng)) @@ -10460,18 +10498,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) { int node; unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; + struct numa_group *ng; + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; } - if (p->numa_group) { - gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], - gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; + if (ng) { + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], + gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; } print_numa_stats(m, node, tsf, tpf, gsf, gpf); } + rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ -- cgit v1.2.3 From 364f6afc4f5537b79cf454eb35cae92920676075 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:40 -0700 Subject: locking/lockdep: Make it clear that what lock_class::key points at is not modified This patch does not change the behavior of the lockdep code. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-2-bvanassche@acm.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 2 +- kernel/locking/lockdep.c | 2 +- kernel/locking/lockdep_internals.h | 3 ++- kernel/locking/lockdep_proc.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 0b0d7259276d..cdb3c2f06092 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -97,7 +97,7 @@ struct lock_class { */ struct list_head locks_after, locks_before; - struct lockdep_subclass_key *key; + const struct lockdep_subclass_key *key; unsigned int subclass; unsigned int dep_gen_id; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..af6627866191 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -511,7 +511,7 @@ static const char *usage_str[] = }; #endif -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +const char *__get_key_name(const struct lockdep_subclass_key *key, char *str) { return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index cc83568d5012..2e518369add4 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -116,7 +116,8 @@ extern struct lock_chain lock_chains[]; extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); -extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +extern const char *__get_key_name(const struct lockdep_subclass_key *key, + char *str); struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index bda006f8a88b..ed9842425cac 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -399,7 +399,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) static void seq_stats(struct seq_file *m, struct lock_stat_data *data) { - struct lockdep_subclass_key *ckey; + const struct lockdep_subclass_key *ckey; struct lock_class_stats *stats; struct lock_class *class; const char *cname; -- cgit v1.2.3 From a2970421640bd9b6a78f2685d7750a791abdfd4e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:41 -0700 Subject: stacktrace: Constify 'entries' arguments Make it clear to humans and to the compiler that the stack trace ('entries') arguments are not modified. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-3-bvanassche@acm.org Signed-off-by: Ingo Molnar --- include/linux/stacktrace.h | 4 ++-- kernel/stacktrace.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index f0cfd12cb45e..83bd8cb475d7 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -9,9 +9,9 @@ struct task_struct; struct pt_regs; #ifdef CONFIG_STACKTRACE -void stack_trace_print(unsigned long *trace, unsigned int nr_entries, +void stack_trace_print(const unsigned long *trace, unsigned int nr_entries, int spaces); -int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, unsigned int nr_entries, int spaces); unsigned int stack_trace_save(unsigned long *store, unsigned int size, unsigned int skipnr); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f5440abb7532..6d1f68b7e528 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -20,7 +20,7 @@ * @nr_entries: Number of entries in the storage array * @spaces: Number of leading spaces to print */ -void stack_trace_print(unsigned long *entries, unsigned int nr_entries, +void stack_trace_print(const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int i; @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print); * * Return: Number of bytes printed. */ -int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int generated, i, total = 0; -- cgit v1.2.3 From 12593b7467f9130b64a6d4b6a26ed4ec217b6784 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:42 -0700 Subject: locking/lockdep: Reduce space occupied by stack traces Although commit 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues") unregisters dynamic lockdep keys when a workqueue is destroyed, a side effect of that commit is that all stack traces associated with the lockdep key are leaked when a workqueue is destroyed. Fix this by storing each unique stack trace once. Other changes in this patch are: - Use NULL instead of { .nr_entries = 0 } to represent 'no trace'. - Store a pointer to a stack trace in struct lock_class and struct lock_list instead of storing 'nr_entries' and 'offset'. This patch avoids that the following program triggers the "BUG: MAX_STACK_TRACE_ENTRIES too low!" complaint: #include #include int main() { for (;;) { int fd = open("/dev/infiniband/rdma_cm", O_RDWR); close(fd); } } Suggested-by: Peter Zijlstra Reported-by: Eric Biggers Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: Yuyang Du Link: https://lkml.kernel.org/r/20190722182443.216015-4-bvanassche@acm.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 9 +-- kernel/locking/lockdep.c | 128 ++++++++++++++++++++++++++----------- kernel/locking/lockdep_internals.h | 2 + 3 files changed, 95 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index cdb3c2f06092..b8a835fd611b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,10 +66,7 @@ struct lock_class_key { extern struct lock_class_key __lockdep_no_validate__; -struct lock_trace { - unsigned int nr_entries; - unsigned int offset; -}; +struct lock_trace; #define LOCKSTAT_POINTS 4 @@ -105,7 +102,7 @@ struct lock_class { * IRQ/softirq usage tracking bits: */ unsigned long usage_mask; - struct lock_trace usage_traces[XXX_LOCK_USAGE_STATES]; + const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; /* * Generation counter, when doing certain classes of graph walking, @@ -193,7 +190,7 @@ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; - struct lock_trace trace; + const struct lock_trace *trace; int distance; /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index af6627866191..1a96869cb2f0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -449,33 +449,72 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; #ifdef CONFIG_PROVE_LOCKING +/** + * struct lock_trace - single stack backtrace + * @hash_entry: Entry in a stack_trace_hash[] list. + * @hash: jhash() of @entries. + * @nr_entries: Number of entries in @entries. + * @entries: Actual stack backtrace. + */ +struct lock_trace { + struct hlist_node hash_entry; + u32 hash; + u32 nr_entries; + unsigned long entries[0] __aligned(sizeof(unsigned long)); +}; +#define LOCK_TRACE_SIZE_IN_LONGS \ + (sizeof(struct lock_trace) / sizeof(unsigned long)) /* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. + * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock. */ static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE]; + +static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2) +{ + return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries && + memcmp(t1->entries, t2->entries, + t1->nr_entries * sizeof(t1->entries[0])) == 0; +} -static int save_trace(struct lock_trace *trace) +static struct lock_trace *save_trace(void) { - unsigned long *entries = stack_trace + nr_stack_trace_entries; + struct lock_trace *trace, *t2; + struct hlist_head *hash_head; + u32 hash; unsigned int max_entries; - trace->offset = nr_stack_trace_entries; - max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->nr_entries = stack_trace_save(entries, max_entries, 3); - nr_stack_trace_entries += trace->nr_entries; + BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); + BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); + + trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - + LOCK_TRACE_SIZE_IN_LONGS; + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - + LOCK_TRACE_SIZE_IN_LONGS - 1) { if (!debug_locks_off_graph_unlock()) - return 0; + return NULL; print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); - return 0; + return NULL; } - return 1; + hash = jhash(trace->entries, trace->nr_entries * + sizeof(trace->entries[0]), 0); + trace->hash = hash; + hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1)); + hlist_for_each_entry(t2, hash_head, hash_entry) { + if (traces_identical(trace, t2)) + return t2; + } + nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries; + hlist_add_head(&trace->hash_entry, hash_head); + + return trace; } #endif @@ -1235,7 +1274,7 @@ static struct lock_list *alloc_list_entry(void) static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct lock_trace *trace) + const struct lock_trace *trace) { struct lock_list *entry; /* @@ -1249,7 +1288,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; entry->distance = distance; - entry->trace = *trace; + entry->trace = trace; /* * Both allocation and removal are done under the graph lock; but * iteration is under RCU-sched; see look_up_lock_class() and @@ -1470,11 +1509,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry, } -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +static void print_lock_trace(const struct lock_trace *trace, + unsigned int spaces) { - unsigned long *entries = stack_trace + trace->offset; - - stack_trace_print(entries, trace->nr_entries, spaces); + stack_trace_print(trace->entries, trace->nr_entries, spaces); } /* @@ -1489,7 +1527,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_lock_trace(&target->trace, 6); + print_lock_trace(target->trace, 6); } static void @@ -1592,7 +1630,8 @@ static noinline void print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; - if (!save_trace(&this->trace)) + this->trace = save_trace(); + if (!this->trace) return; depth = get_lock_depth(target); @@ -1715,7 +1754,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry, */ static noinline int check_noncircular(struct held_lock *src, struct held_lock *target, - struct lock_trace *trace) + struct lock_trace **const trace) { int ret; struct lock_list *uninitialized_var(target_entry); @@ -1729,13 +1768,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target, ret = check_path(hlock_class(target), &src_entry, &target_entry); if (unlikely(!ret)) { - if (!trace->nr_entries) { + if (!*trace) { /* * If save_trace fails here, the printing might * trigger a WARN but because of the !nr_entries it * should not do bad things. */ - save_trace(trace); + *trace = save_trace(); } print_circular_bug(&src_entry, target_entry, src, target); @@ -1859,7 +1898,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_lock_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces[bit], len); } } printk("%*s }\n", depth, ""); @@ -1884,7 +1923,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_lock_trace(&entry->trace, 2); + print_lock_trace(entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1995,14 +2034,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces[bit2], 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -2011,13 +2050,15 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); - if (!save_trace(&prev_root->trace)) + prev_root->trace = save_trace(); + if (!prev_root->trace) return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); - if (!save_trace(&next_root->trace)) + next_root->trace = save_trace(); + if (!next_root->trace) return; print_shortest_lock_dependencies(forwards_entry, next_root); @@ -2369,7 +2410,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct lock_trace *trace) + struct held_lock *next, int distance, + struct lock_trace **const trace) { struct lock_list *entry; int ret; @@ -2444,8 +2486,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return ret; #endif - if (!trace->nr_entries && !save_trace(trace)) - return 0; + if (!*trace) { + *trace = save_trace(); + if (!*trace) + return 0; + } /* * Ok, all validations passed, add the new lock @@ -2453,14 +2498,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; @@ -2476,7 +2521,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { - struct lock_trace trace = { .nr_entries = 0 }; + struct lock_trace *trace = NULL; int depth = curr->lockdep_depth; struct held_lock *hlock; @@ -3015,7 +3060,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -3096,7 +3141,8 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); - if (!save_trace(&root->trace)) + root->trace = save_trace(); + if (!root->trace) return; print_shortest_lock_dependencies(other, root); @@ -3580,7 +3626,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) return 0; switch (new_bit) { @@ -5157,6 +5203,12 @@ void __init lockdep_init(void) ) / 1024 ); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + printk(" memory used for stack traces: %zu kB\n", + (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 + ); +#endif + printk(" per task-struct memory footprint: %zu bytes\n", sizeof(((struct task_struct *)NULL)->held_locks)); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 2e518369add4..93a008bf77db 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_STACK_TRACE_ENTRIES 262144UL +#define STACK_TRACE_HASH_SIZE 8192 #else #define MAX_LOCKDEP_ENTRIES 32768UL @@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#define STACK_TRACE_HASH_SIZE 16384 #endif #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) -- cgit v1.2.3 From e797bda3fd29137f6c151dfa10ea6a61c17895ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Jul 2019 20:47:16 +0200 Subject: smp/hotplug: Track booted once CPUs in a cpumask The booted once information which is required to deal with the MCE broadcast issue on X86 correctly is stored in the per cpu hotplug state, which is perfectly fine for the intended purpose. X86 needs that information for supporting NMI broadcasting via shortcuts, but retrieving it from per cpu data is cumbersome. Move it to a cpumask so the information can be checked against the cpu_present_mask quickly. No functional change intended. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190722105219.818822855@linutronix.de --- include/linux/cpumask.h | 2 ++ kernel/cpu.c | 11 +++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 21755471b1c3..693124900f0a 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -115,6 +115,8 @@ extern struct cpumask __cpu_active_mask; #define cpu_active(cpu) ((cpu) == 0) #endif +extern cpumask_t cpus_booted_once_mask; + static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS diff --git a/kernel/cpu.c b/kernel/cpu.c index e84c0873559e..05778e32674a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -62,7 +62,6 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; - bool booted_once; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -76,6 +75,10 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { .fail = CPUHP_INVALID, }; +#ifdef CONFIG_SMP +cpumask_t cpus_booted_once_mask; +#endif + #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); @@ -433,7 +436,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ - return !per_cpu(cpuhp_state, cpu).booted_once; + return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } @@ -1066,7 +1069,7 @@ void notify_cpu_starting(unsigned int cpu) int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ - st->booted_once = true; + cpumask_set_cpu(cpu, &cpus_booted_once_mask); while (st->state < target) { st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); @@ -2334,7 +2337,7 @@ void __init boot_cpu_init(void) void __init boot_cpu_hotplug_init(void) { #ifdef CONFIG_SMP - this_cpu_write(cpuhp_state.booted_once, true); + cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -- cgit v1.2.3 From b9fa6442f7043e2cdd247905d4f3b80f2e9605cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Jul 2019 20:47:24 +0200 Subject: cpumask: Implement cpumask_or_equal() The IPI code of x86 needs to evaluate whether the target cpumask is equal to the cpu_online_mask or equal except for the calling CPU. To replace the current implementation which requires the usage of a temporary cpumask, which might involve allocations, add a new function which compares a cpumask to the result of two other cpumasks which are or'ed together before comparison. This allows to make the required decision in one go and the calling code then can check for the calling CPU being set in the target mask with cpumask_test_cpu(). Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190722105220.585449120@linutronix.de --- include/linux/bitmap.h | 23 +++++++++++++++++++++++ include/linux/cpumask.h | 14 ++++++++++++++ lib/bitmap.c | 20 ++++++++++++++++++++ 3 files changed, 57 insertions(+) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index f58e97446abc..90528f12bdfa 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -120,6 +120,10 @@ extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +extern bool __pure __bitmap_or_equal(const unsigned long *src1, + const unsigned long *src2, + const unsigned long *src3, + unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, @@ -321,6 +325,25 @@ static inline int bitmap_equal(const unsigned long *src1, return __bitmap_equal(src1, src2, nbits); } +/** + * bitmap_or_equal - Check whether the or of two bitnaps is equal to a third + * @src1: Pointer to bitmap 1 + * @src2: Pointer to bitmap 2 will be or'ed with bitmap 1 + * @src3: Pointer to bitmap 3. Compare to the result of *@src1 | *@src2 + * + * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise + */ +static inline bool bitmap_or_equal(const unsigned long *src1, + const unsigned long *src2, + const unsigned long *src3, + unsigned int nbits) +{ + if (!small_const_nbits(nbits)) + return __bitmap_or_equal(src1, src2, src3, nbits); + + return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); +} + static inline int bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 693124900f0a..0c7db5efe66c 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -475,6 +475,20 @@ static inline bool cpumask_equal(const struct cpumask *src1p, nr_cpumask_bits); } +/** + * cpumask_or_equal - *src1p | *src2p == *src3p + * @src1p: the first input + * @src2p: the second input + * @src3p: the third input + */ +static inline bool cpumask_or_equal(const struct cpumask *src1p, + const struct cpumask *src2p, + const struct cpumask *src3p) +{ + return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), + cpumask_bits(src3p), nr_cpumask_bits); +} + /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input diff --git a/lib/bitmap.c b/lib/bitmap.c index bbe2589e8497..f9e834841e94 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -59,6 +59,26 @@ int __bitmap_equal(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_equal); +bool __bitmap_or_equal(const unsigned long *bitmap1, + const unsigned long *bitmap2, + const unsigned long *bitmap3, + unsigned int bits) +{ + unsigned int k, lim = bits / BITS_PER_LONG; + unsigned long tmp; + + for (k = 0; k < lim; ++k) { + if ((bitmap1[k] | bitmap2[k]) != bitmap3[k]) + return false; + } + + if (!(bits % BITS_PER_LONG)) + return true; + + tmp = (bitmap1[k] | bitmap2[k]) ^ bitmap3[k]; + return (tmp & BITMAP_LAST_WORD_MASK(bits)) == 0; +} + void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { unsigned int k, lim = BITS_TO_LONGS(bits); -- cgit v1.2.3 From 0c09ab96fc820109d63097a2adcbbd20836b655f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Jul 2019 16:23:40 +0200 Subject: cpu/hotplug: Cache number of online CPUs Re-evaluating the bitmap wheight of the online cpus bitmap in every invocation of num_online_cpus() over and over is a pretty useless exercise. Especially when num_online_cpus() is used in code paths like the IPI delivery of x86 or the membarrier code. Cache the number of online CPUs in the core and just return the cached variable. The accessor function provides only a snapshot when used without protection against concurrent CPU hotplug. The storage needs to use an atomic_t because the kexec and reboot code (ab)use set_cpu_online() in their 'shutdown' handlers without any form of serialization as pointed out by Mathieu. Regular CPU hotplug usage is properly serialized. Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907091622590.1634@nanos.tec.linutronix.de --- include/linux/cpumask.h | 25 ++++++++++++++++--------- kernel/cpu.c | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 0c7db5efe66c..b5a5a1ed9efd 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -10,6 +10,7 @@ #include #include #include +#include #include /* Don't assign or return these: may not be this big! */ @@ -95,8 +96,21 @@ extern struct cpumask __cpu_active_mask; #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) +extern atomic_t __num_online_cpus; + #if NR_CPUS > 1 -#define num_online_cpus() cpumask_weight(cpu_online_mask) +/** + * num_online_cpus() - Read the number of online CPUs + * + * Despite the fact that __num_online_cpus is of type atomic_t, this + * interface gives only a momentary snapshot and is not protected against + * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held + * region. + */ +static inline unsigned int num_online_cpus(void) +{ + return atomic_read(&__num_online_cpus); +} #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) @@ -821,14 +835,7 @@ set_cpu_present(unsigned int cpu, bool present) cpumask_clear_cpu(cpu, &__cpu_present_mask); } -static inline void -set_cpu_online(unsigned int cpu, bool online) -{ - if (online) - cpumask_set_cpu(cpu, &__cpu_online_mask); - else - cpumask_clear_cpu(cpu, &__cpu_online_mask); -} +void set_cpu_online(unsigned int cpu, bool online); static inline void set_cpu_active(unsigned int cpu, bool active) diff --git a/kernel/cpu.c b/kernel/cpu.c index 05778e32674a..e1967e9eddc2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2298,6 +2298,9 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +atomic_t __num_online_cpus __read_mostly; +EXPORT_SYMBOL(__num_online_cpus); + void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); @@ -2313,6 +2316,27 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(&__cpu_online_mask, src); } +void set_cpu_online(unsigned int cpu, bool online) +{ + /* + * atomic_inc/dec() is required to handle the horrid abuse of this + * function by the reboot and kexec code which invoke it from + * IPI/NMI broadcasts when shutting down CPUs. Invocation from + * regular CPU hotplug is properly serialized. + * + * Note, that the fact that __num_online_cpus is of type atomic_t + * does not protect readers which are not serialized against + * concurrent hotplug operations. + */ + if (online) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask)) + atomic_inc(&__num_online_cpus); + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) + atomic_dec(&__num_online_cpus); + } +} + /* * Activate the first processor. */ -- cgit v1.2.3 From 7b3c92b85a65c2db1f542265bc98e1f9e3056eba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 4 Jul 2019 15:13:23 -0700 Subject: sched/core: Convert get_task_struct() to return the task Returning the pointer that was passed in allows us to write slightly more idiomatic code. Convert a few users. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190704221323.24290-1-willy@infradead.org Signed-off-by: Ingo Molnar --- include/linux/sched/task.h | 6 +++++- kernel/events/core.c | 9 +++------ kernel/irq/manage.c | 3 +-- kernel/locking/rtmutex.c | 6 ++---- kernel/trace/trace_sched_wakeup.c | 3 +-- 5 files changed, 12 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 0497091e40c1..3d90ed8f75f0 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -105,7 +105,11 @@ extern void sched_exec(void); #define sched_exec() {} #endif -#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0) +static inline struct task_struct *get_task_struct(struct task_struct *t) +{ + refcount_inc(&t->usage); + return t; +} extern void __put_task_struct(struct task_struct *t); diff --git a/kernel/events/core.c b/kernel/events/core.c index 026a14541a38..ea5e8139fe62 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4089,10 +4089,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) return NULL; __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } + if (task) + ctx->task = get_task_struct(task); ctx->pmu = pmu; return ctx; @@ -10355,8 +10353,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ - get_task_struct(task); - event->hw.target = task; + event->hw.target = get_task_struct(task); } event->clock = &local_clock; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e8f7f179bf77..9d50fbe5531a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ - get_task_struct(t); - new->thread = t; + new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. This is * important for shared interrupt handlers as we do diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fa83d36e30c6..2874bf556162 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* @@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. the owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 743b2b520d34..5e43b9664eca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -579,8 +579,7 @@ probe_wakeup(void *ignore, struct task_struct *p) else tracing_dl = 0; - wakeup_task = p; - get_task_struct(wakeup_task); + wakeup_task = get_task_struct(p); local_save_flags(flags); -- cgit v1.2.3 From c22645f4c8f021fb1c5e7189eb1f968132cc0844 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:53 +0200 Subject: sched/topology: Add partition_sched_domains_locked() Introduce the partition_sched_domains_locked() function by taking the mutex locking code out of the original function. That way the work done by partition_sched_domains_locked() can be reused without dropping the mutex lock. No change of functionality is introduced by this patch. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-2-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 10 ++++++++++ kernel/sched/topology.c | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7863bb62d2ab..f341163fedc9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -150,6 +150,10 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } +extern void partition_sched_domains_locked(int ndoms_new, + cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new); + extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); @@ -194,6 +198,12 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl); struct sched_domain_attr; +static inline void +partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ +} + static inline void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4eea2c9bc732..5a174ae6ecf3 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2169,16 +2169,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. * - * Call with hotplug lock held + * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) +void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; int i, j, n; int new_topology; - mutex_lock(&sched_domains_mutex); + lockdep_assert_held(&sched_domains_mutex); /* Always unregister in case we don't destroy any domains: */ unregister_sched_domain_sysctl(); @@ -2261,6 +2261,15 @@ match3: ndoms_cur = ndoms_new; register_sched_domain_sysctl(); +} +/* + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); mutex_unlock(&sched_domains_mutex); } -- cgit v1.2.3 From f9a25f776d780bfa3279f0b6e5f5cf3224997976 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:55 +0200 Subject: cpusets: Rebuild root domain deadline accounting information When the topology of root domains is modified by CPUset or CPUhotplug operations information about the current deadline bandwidth held in the root domain is lost. This patch addresses the issue by recalculating the lost deadline bandwidth information by circling through the deadline tasks held in CPUsets and adding their current load to the root domain they are associated with. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Juri Lelli [ Various additional modifications. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-4-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/cgroup.h | 1 + include/linux/sched.h | 5 ++++ include/linux/sched/deadline.h | 8 ++++++ kernel/cgroup/cgroup.c | 2 +- kernel/cgroup/cpuset.c | 64 +++++++++++++++++++++++++++++++++++++++++- kernel/sched/deadline.c | 30 ++++++++++++++++++++ kernel/sched/sched.h | 3 -- kernel/sched/topology.c | 13 ++++++++- 8 files changed, 120 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b048902d6c..3ba3e6da13a6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -150,6 +150,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); +void cgroup_enable_task_cg_lists(void); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f51932bd543..b94ad92dfbe6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -295,6 +295,11 @@ enum uclamp_id { UCLAMP_CNT }; +#ifdef CONFIG_SMP +extern struct root_domain def_root_domain; +extern struct mutex sched_domains_mutex; +#endif + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 0cb034331cbb..1aff00b65f3c 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -24,3 +24,11 @@ static inline bool dl_time_before(u64 a, u64 b) { return (s64)(a - b) < 0; } + +#ifdef CONFIG_SMP + +struct root_domain; +extern void dl_add_task_root_domain(struct task_struct *p); +extern void dl_clear_root_domain(struct root_domain *rd); + +#endif /* CONFIG_SMP */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 753afbca549f..4b5bc452176c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc) */ static bool use_task_css_set_links __read_mostly; -static void cgroup_enable_task_cg_lists(void) +void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5aa37531ce76..846cbdb68566 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -894,6 +895,67 @@ done: return ndoms; } +static void update_tasks_root_domain(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) + dl_add_task_root_domain(task); + + css_task_iter_end(&it); +} + +static void rebuild_root_domains(void) +{ + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + + lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + + cgroup_enable_task_cg_lists(); + + rcu_read_lock(); + + /* + * Clear default root domain DL accounting, it will be computed again + * if a task belongs to it. + */ + dl_clear_root_domain(&def_root_domain); + + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + + if (cpumask_empty(cs->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + css_get(&cs->css); + + rcu_read_unlock(); + + update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); +} + +static void +partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + rebuild_root_domains(); + mutex_unlock(&sched_domains_mutex); +} + /* * Rebuild scheduler domains. * @@ -931,7 +993,7 @@ static void rebuild_sched_domains_locked(void) ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); + partition_and_rebuild_sched_domains(ndoms, doms, attr); out: put_online_cpus(); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ef5b9f6b1d42..0f9d2180be23 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2283,6 +2283,36 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +void dl_add_task_root_domain(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + struct dl_bw *dl_b; + + rq = task_rq_lock(p, &rf); + if (!dl_task(p)) + goto unlock; + + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + + raw_spin_unlock(&dl_b->lock); + +unlock: + task_rq_unlock(rq, p, &rf); +} + +void dl_clear_root_domain(struct root_domain *rd) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + rd->dl_bw.total_bw = 0; + raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); +} + #endif /* CONFIG_SMP */ static void switched_from_dl(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 16126efd14ed..7583faddba33 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -778,9 +778,6 @@ struct root_domain { struct perf_domain __rcu *pd; }; -extern struct root_domain def_root_domain; -extern struct mutex sched_domains_mutex; - extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5a174ae6ecf3..8f83e8e3ea9a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2203,8 +2203,19 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) + dattrs_equal(dattr_cur, i, dattr_new, j)) { + struct root_domain *rd; + + /* + * This domain won't be destroyed and as such + * its dl_bw->total_bw needs to be cleared. It + * will be recomputed in function + * update_tasks_root_domain(). + */ + rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; + dl_clear_root_domain(rd); goto match1; + } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); -- cgit v1.2.3 From d74b27d63a8bebe2fe634944e4ebdc7b10db7a39 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 15:59:58 +0200 Subject: cgroup/cpuset: Change cpuset_rwsem and hotplug lock order cpuset_rwsem is going to be acquired from sched_setscheduler() with a following patch. There are however paths (e.g., spawn_ksoftirqd) in which sched_scheduler() is eventually called while holding hotplug lock; this creates a dependecy between hotplug lock (to be always acquired first) and cpuset_rwsem (to be always acquired after hotplug lock). Fix paths which currently take the two locks in the wrong order (after a following patch is applied). Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-7-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 8 ++++---- kernel/cgroup/cpuset.c | 22 +++++++++++++++++----- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 934633a05d20..7f1478c26a33 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void) static inline void cpuset_inc(void) { - static_branch_inc(&cpusets_pre_enable_key); - static_branch_inc(&cpusets_enabled_key); + static_branch_inc_cpuslocked(&cpusets_pre_enable_key); + static_branch_inc_cpuslocked(&cpusets_enabled_key); } static inline void cpuset_dec(void) { - static_branch_dec(&cpusets_enabled_key); - static_branch_dec(&cpusets_pre_enable_key); + static_branch_dec_cpuslocked(&cpusets_enabled_key); + static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } extern int cpuset_init(void); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e1a8d168c5e9..5c5014caa23c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -973,8 +973,8 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms; + lockdep_assert_cpus_held(); percpu_rwsem_assert_held(&cpuset_rwsem); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -983,19 +983,17 @@ static void rebuild_sched_domains_locked(void) */ if (!top_cpuset.nr_subparts_cpus && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; if (top_cpuset.nr_subparts_cpus && !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_and_rebuild_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) @@ -1005,9 +1003,11 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { + get_online_cpus(); percpu_down_write(&cpuset_rwsem); rebuild_sched_domains_locked(); percpu_up_write(&cpuset_rwsem); + put_online_cpus(); } /** @@ -2245,6 +2245,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -2282,6 +2283,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return retval; } @@ -2292,6 +2294,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2306,6 +2309,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return retval; } @@ -2344,6 +2348,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2369,6 +2374,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2499,6 +2505,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, return -EINVAL; css_get(&cs->css); + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2506,6 +2513,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, retval = update_prstate(cs, val); out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); css_put(&cs->css); return retval ?: nbytes; } @@ -2711,6 +2719,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; + get_online_cpus(); percpu_down_write(&cpuset_rwsem); set_bit(CS_ONLINE, &cs->flags); @@ -2763,6 +2772,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return 0; } @@ -2781,6 +2791,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (is_partition_root(cs)) @@ -2801,6 +2812,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); percpu_up_write(&cpuset_rwsem); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 710da3c8ea7dfbd327920afd3831d8c82c42789d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 16:00:00 +0200 Subject: sched/core: Prevent race condition between cpuset and __sched_setscheduler() No synchronisation mechanism exists between the cpuset subsystem and calls to function __sched_setscheduler(). As such, it is possible that new root domains are created on the cpuset side while a deadline acceptance test is carried out in __sched_setscheduler(), leading to a potential oversell of CPU bandwidth. Grab cpuset_rwsem read lock from core scheduler, so to prevent situations such as the one described above from happening. The only exception is normalize_rt_tasks() which needs to work under tasklist_lock and can't therefore grab cpuset_rwsem. We are fine with this, as this function is only called by sysrq and, if that gets triggered, DEADLINE guarantees are already gone out of the window anyway. Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-9-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 5 +++++ kernel/cgroup/cpuset.c | 11 +++++++++++ kernel/sched/core.c | 20 +++++++++++++++++--- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 7f1478c26a33..04c20de66afc 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -55,6 +55,8 @@ extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void cpuset_wait_for_hotplug(void); +extern void cpuset_read_lock(void); +extern void cpuset_read_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -176,6 +178,9 @@ static inline void cpuset_update_active_cpus(void) static inline void cpuset_wait_for_hotplug(void) { } +static inline void cpuset_read_lock(void) { } +static inline void cpuset_read_unlock(void) { } + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5c5014caa23c..c52bc91f882b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -334,6 +334,17 @@ static struct cpuset top_cpuset = { */ DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); + +void cpuset_read_lock(void) +{ + percpu_down_read(&cpuset_rwsem); +} + +void cpuset_read_unlock(void) +{ + percpu_up_read(&cpuset_rwsem); +} + static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1af3d2dc6b29..1bceb22dac18 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4698,6 +4698,9 @@ recheck: return retval; } + if (pi) + cpuset_read_lock(); + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4772,6 +4775,8 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); goto recheck; } @@ -4832,8 +4837,10 @@ change: preempt_disable(); task_rq_unlock(rq, p, &rf); - if (pi) + if (pi) { + cpuset_read_unlock(); rt_mutex_adjust_pi(p); + } /* Run balance callbacks after we've adjusted the PI chain: */ balance_callback(rq); @@ -4843,6 +4850,8 @@ change: unlock: task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); return retval; } @@ -4927,10 +4936,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setscheduler(p, policy, &lparam); + put_task_struct(p); + } + return retval; } -- cgit v1.2.3 From 39289bfc221423b27570e7c9157b690828e786cb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 22 Jul 2019 17:01:30 +0000 Subject: RDMA: Make most headers compile stand alone So that rdma can work with CONFIG_KERNEL_HEADER_TEST and CONFIG_HEADERS_CHECK. Link: https://lore.kernel.org/r/20190722170126.GA16453@ziepe.ca Signed-off-by: Jason Gunthorpe --- include/Kbuild | 6 ------ include/rdma/ib.h | 2 ++ include/rdma/iw_portmap.h | 3 +++ include/rdma/opa_port_info.h | 2 ++ include/rdma/rdmavt_cq.h | 1 + include/rdma/signature.h | 2 ++ 6 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/Kbuild b/include/Kbuild index c38f0d46b267..fc2aa4e20658 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -945,12 +945,6 @@ header-test- += net/xdp.h header-test- += net/xdp_priv.h header-test- += pcmcia/cistpl.h header-test- += pcmcia/ds.h -header-test- += rdma/ib.h -header-test- += rdma/iw_portmap.h -header-test- += rdma/opa_port_info.h -header-test- += rdma/rdmavt_cq.h -header-test- += rdma/restrack.h -header-test- += rdma/signature.h header-test- += rdma/tid_rdma_defs.h header-test- += scsi/fc/fc_encaps.h header-test- += scsi/fc/fc_fc2.h diff --git a/include/rdma/ib.h b/include/rdma/ib.h index 4f385ec54f80..fe2fc9e91588 100644 --- a/include/rdma/ib.h +++ b/include/rdma/ib.h @@ -36,6 +36,8 @@ #include #include #include +#include +#include struct ib_addr { union { diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h index b9fee7feeeb5..c89535047c42 100644 --- a/include/rdma/iw_portmap.h +++ b/include/rdma/iw_portmap.h @@ -33,6 +33,9 @@ #ifndef _IW_PORTMAP_H #define _IW_PORTMAP_H +#include +#include + #define IWPM_ULIBNAME_SIZE 32 #define IWPM_DEVNAME_SIZE 32 #define IWPM_IFNAME_SIZE 16 diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index 7147a9263011..bdbfe25d3854 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -33,6 +33,8 @@ #if !defined(OPA_PORT_INFO_H) #define OPA_PORT_INFO_H +#include + #define OPA_PORT_LINK_MODE_NOP 0 /* No change */ #define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */ diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index 04c519ef6d71..574eb7278f46 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -53,6 +53,7 @@ #include #include +#include /* * Define an ib_cq_notify value that is not valid so we know when CQ diff --git a/include/rdma/signature.h b/include/rdma/signature.h index f24cc2a1d3c5..d16b0fcc8344 100644 --- a/include/rdma/signature.h +++ b/include/rdma/signature.h @@ -6,6 +6,8 @@ #ifndef _RDMA_SIGNATURE_H_ #define _RDMA_SIGNATURE_H_ +#include + enum ib_signature_prot_cap { IB_PROT_T10DIF_TYPE_1 = 1, IB_PROT_T10DIF_TYPE_2 = 1 << 1, -- cgit v1.2.3 From 8732d85a69a0411f16a4b78df8fdc7b09c50a849 Mon Sep 17 00:00:00 2001 From: Mattias Jacobsson <2pi@mok.nu> Date: Fri, 19 Jul 2019 19:51:45 +0200 Subject: platform/x86: wmi: add missing struct parameter description Add a description for the context parameter in the struct wmi_device_id. Reported-by: kbuild test robot Fixes: a48e23385fcf ("platform/x86: wmi: add context pointer field to struct wmi_device_id") Signed-off-by: Mattias Jacobsson <2pi@mok.nu> Signed-off-by: Andy Shevchenko --- include/linux/mod_devicetable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index b2c1648f7e5d..5714fd35a83c 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -814,6 +814,7 @@ struct tee_client_device_id { /** * struct wmi_device_id - WMI device identifier * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * @context: pointer to driver specific data */ struct wmi_device_id { const char guid_string[UUID_STRING_LEN+1]; -- cgit v1.2.3 From 1d2fedd8561dc469a7503855ee602f4bb3eccfa7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 23 Jul 2019 10:02:05 +0300 Subject: RDMA/core: Support netlink commands in non init_net net namespaces Now that IB core supports RDMA device binding with specific net namespace, enable IB core to accept netlink commands in non init_net namespaces. This is done by having per net namespace netlink socket. At present only netlink device handling client RDMA_NL_NLDEV supports device handling in multiple net namespaces. Hence do not accept netlink messages for other clients in non init_net net namespaces. Link: https://lore.kernel.org/r/20190723070205.6247-1-leon@kernel.org Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/core_priv.h | 24 +++++++++++++- drivers/infiniband/core/device.c | 38 ++++++++-------------- drivers/infiniband/core/iwpm_msg.c | 8 ++--- drivers/infiniband/core/iwpm_util.c | 6 ++-- drivers/infiniband/core/netlink.c | 63 +++++++++++++++++++++++++------------ drivers/infiniband/core/nldev.c | 20 ++++++------ drivers/infiniband/core/sa_query.c | 2 +- include/rdma/rdma_netlink.h | 10 ++++-- 9 files changed, 105 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9b76a8fcdd24..1dd467bed8fc 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -183,7 +183,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); + rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 888d89ce81df..589ed805e0ad 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include @@ -54,8 +56,26 @@ struct pkey_index_qp_list { struct list_head qp_list; }; +/** + * struct rdma_dev_net - rdma net namespace metadata for a net + * @nl_sock: Pointer to netlink socket + * @net: Pointer to owner net namespace + * @id: xarray id to identify the net namespace. + */ +struct rdma_dev_net { + struct sock *nl_sock; + possible_net_t net; + u32 id; +}; + extern const struct attribute_group ib_dev_attr_group; extern bool ib_devices_shared_netns; +extern unsigned int rdma_dev_net_id; + +static inline struct rdma_dev_net *rdma_net_to_dev_net(struct net *net) +{ + return net_generic(net, rdma_dev_net_id); +} int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); @@ -179,7 +199,6 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); -int rdma_nl_init(void); void rdma_nl_exit(void); int ib_nl_handle_resolve_resp(struct sk_buff *skb, @@ -362,4 +381,7 @@ void ib_port_unregister_module_stat(struct kobject *kobj); int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd); + +int rdma_nl_net_init(struct rdma_dev_net *rnet); +void rdma_nl_net_exit(struct rdma_dev_net *rnet); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 309210d2d4a8..c3576c7d2e8f 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -105,17 +104,7 @@ static DECLARE_RWSEM(clients_rwsem); */ #define CLIENT_DATA_REGISTERED XA_MARK_1 -/** - * struct rdma_dev_net - rdma net namespace metadata for a net - * @net: Pointer to owner net namespace - * @id: xarray id to identify the net namespace. - */ -struct rdma_dev_net { - possible_net_t net; - u32 id; -}; - -static unsigned int rdma_dev_net_id; +unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary @@ -1050,7 +1039,7 @@ int rdma_compatdev_set(u8 enable) static void rdma_dev_exit_net(struct net *net) { - struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; @@ -1084,25 +1073,32 @@ static void rdma_dev_exit_net(struct net *net) } up_read(&devices_rwsem); + rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { - struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; + write_pnet(&rnet->net, net); + + ret = rdma_nl_net_init(rnet); + if (ret) + return ret; + /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; - write_pnet(&rnet->net, net); - ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); - if (ret) + if (ret) { + rdma_nl_net_exit(rnet); return ret; + } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { @@ -2629,12 +2625,6 @@ static int __init ib_core_init(void) goto err_comp_unbound; } - ret = rdma_nl_init(); - if (ret) { - pr_warn("Couldn't init IB netlink interface: err %d\n", ret); - goto err_sysfs; - } - ret = addr_init(); if (ret) { pr_warn("Could't init IB address resolution\n"); @@ -2680,8 +2670,6 @@ err_mad: err_addr: addr_cleanup(); err_ibnl: - rdma_nl_exit(); -err_sysfs: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 2452b0ddcf0d..f1a873d4e842 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -112,7 +112,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); - ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; @@ -202,7 +202,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -297,7 +297,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -364,7 +364,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) nlmsg_end(skb, nlh); - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 41929bb83739..c7ad3499228c 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -645,7 +645,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) nlmsg_end(skb, nlh); - ret = rdma_nl_unicast(skb, iwpm_pid); + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; @@ -674,7 +674,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; - ret = rdma_nl_unicast(skb, iwpm_pid); + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; @@ -824,7 +824,7 @@ int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) goto hello_num_error; nlmsg_end(skb, nlh); - ret = rdma_nl_unicast(skb, iwpm_pid); + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index eecfc0b377c9..67a76aca2dd6 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -36,20 +36,22 @@ #include #include #include +#include #include #include #include #include "core_priv.h" static DEFINE_MUTEX(rdma_nl_mutex); -static struct sock *nls; static struct { const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; bool rdma_nl_chk_listeners(unsigned int group) { - return netlink_has_listeners(nls, group); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net); + + return netlink_has_listeners(rnet->nl_sock, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); @@ -73,13 +75,21 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op) return (op < max_num_ops[type]) ? true : false; } -static bool is_nl_valid(unsigned int type, unsigned int op) +static bool +is_nl_valid(const struct sk_buff *skb, unsigned int type, unsigned int op) { const struct rdma_nl_cbs *cb_table; if (!is_nl_msg_valid(type, op)) return false; + /* + * Currently only NLDEV client is supporting netlink commands in + * non init_net net namespace. + */ + if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV) + return false; + if (!rdma_nl_types[type].cb_table) { mutex_unlock(&rdma_nl_mutex); request_module("rdma-netlink-subsys-%d", type); @@ -161,7 +171,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int op = RDMA_NL_GET_OP(type); const struct rdma_nl_cbs *cb_table; - if (!is_nl_valid(index, op)) + if (!is_nl_valid(skb, index, op)) return -EINVAL; cb_table = rdma_nl_types[index].cb_table; @@ -185,7 +195,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .dump = cb_table[op].dump, }; if (c.dump) - return netlink_dump_start(nls, skb, nlh, &c); + return netlink_dump_start(skb->sk, skb, nlh, &c); return -EINVAL; } @@ -258,52 +268,65 @@ static void rdma_nl_rcv(struct sk_buff *skb) mutex_unlock(&rdma_nl_mutex); } -int rdma_nl_unicast(struct sk_buff *skb, u32 pid) +int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid) { + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; - err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); + err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast); -int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) +int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid) { + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; - err = netlink_unicast(nls, skb, pid, 0); + err = netlink_unicast(rnet->nl_sock, skb, pid, 0); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast_wait); -int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) +int rdma_nl_multicast(struct net *net, struct sk_buff *skb, + unsigned int group, gfp_t flags) { - return nlmsg_multicast(nls, skb, 0, group, flags); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + + return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags); } EXPORT_SYMBOL(rdma_nl_multicast); -int __init rdma_nl_init(void) +void rdma_nl_exit(void) +{ + int idx; + + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + WARN(rdma_nl_types[idx].cb_table, + "Nelink client %d wasn't released prior to unloading %s\n", + idx, KBUILD_MODNAME); +} + +int rdma_nl_net_init(struct rdma_dev_net *rnet) { + struct net *net = read_pnet(&rnet->net); struct netlink_kernel_cfg cfg = { .input = rdma_nl_rcv, }; + struct sock *nls; - nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); + nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg); if (!nls) return -ENOMEM; nls->sk_sndtimeo = 10 * HZ; + rnet->nl_sock = nls; return 0; } -void rdma_nl_exit(void) +void rdma_nl_net_exit(struct rdma_dev_net *rnet) { - int idx; - - for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) - rdma_nl_unregister(idx); - - netlink_kernel_release(nls); + netlink_kernel_release(rnet->nl_sock); } MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 783e465e7c41..e287b71a1cfd 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -832,7 +832,7 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -972,7 +972,7 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -1074,7 +1074,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -1251,7 +1251,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -1596,7 +1596,7 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, put_device(data.cdev); if (ibdev) ib_device_put(ibdev); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); out_data: put_device(data.cdev); @@ -1636,7 +1636,7 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } nlmsg_end(msg, nlh); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); } static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1734,7 +1734,7 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_fill: rdma_counter_unbind_qpn(device, port, qpn, cntn); @@ -1802,7 +1802,7 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_fill: rdma_counter_bind_qpn(device, port, qpn, cntn); @@ -1893,7 +1893,7 @@ static int stat_get_doit_default_counter(struct sk_buff *skb, mutex_unlock(&stats->lock); nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_table: nla_nest_cancel(msg, table_attr); @@ -1961,7 +1961,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_msg: nlmsg_free(msg); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7d8071c7e564..17fc2936c077 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -860,7 +860,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); + return rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_mask); } static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 6631624e4d7c..ab22759de7ea 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -76,28 +76,32 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, /** * Send the supplied skb to a specific userspace PID. + * @net: Net namespace in which to send the skb * @skb: The netlink skb * @pid: Userspace netlink process ID * Returns 0 on success or a negative error code. */ -int rdma_nl_unicast(struct sk_buff *skb, u32 pid); +int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid); /** * Send, with wait/1 retry, the supplied skb to a specific userspace PID. + * @net: Net namespace in which to send the skb * @skb: The netlink skb * @pid: Userspace netlink process ID * Returns 0 on success or a negative error code. */ -int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); +int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid); /** * Send the supplied skb to a netlink group. + * @net: Net namespace in which to send the skb * @skb: The netlink skb * @group: Netlink group ID * @flags: allocation flags * Returns 0 on success or a negative error code. */ -int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); +int rdma_nl_multicast(struct net *net, struct sk_buff *skb, + unsigned int group, gfp_t flags); /** * Check if there are any listeners to the netlink group -- cgit v1.2.3 From 0058eb589881056b49a4ba15dfa3f1b8db53991c Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 9 Jul 2019 17:17:33 +0300 Subject: qed*: Change dpi_addr to be denoted with __iomem Several casts were required around dpi_addr parameter in qed_rdma_if.h This is an address on the doorbell bar and should therefore be marked with __iomem. Link: https://lore.kernel.org/r/20190709141735.19193-5-michal.kalderon@marvell.com Reported-by: Jason Gunthorpe Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/main.c | 2 +- drivers/infiniband/hw/qedr/qedr.h | 2 +- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 5 ++--- include/linux/qed/qed_rdma_if.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 533157a2a3be..ca5f6f65c929 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -820,7 +820,7 @@ static int qedr_init_hw(struct qedr_dev *dev) if (rc) goto out; - dev->db_addr = (void __iomem *)(uintptr_t)out_params.dpi_addr; + dev->db_addr = out_params.dpi_addr; dev->db_phys_addr = out_params.dpi_phys_addr; dev->db_size = out_params.dpi_size; dev->dpi = out_params.dpi; diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index a92ca22e5de1..0cfd849b13d6 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -229,7 +229,7 @@ struct qedr_ucontext { struct ib_ucontext ibucontext; struct qedr_dev *dev; struct qedr_pd *pd; - u64 dpi_addr; + void __iomem *dpi_addr; u64 dpi_phys_addr; u32 dpi_size; u16 dpi; diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index f900fde448db..95eba277f6e2 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -799,9 +799,8 @@ static int qed_rdma_add_user(void *rdma_cxt, /* Calculate the corresponding DPI address */ dpi_start_offset = p_hwfn->dpi_start_offset; - out_params->dpi_addr = (u64)((u8 __iomem *)p_hwfn->doorbells + - dpi_start_offset + - ((out_params->dpi) * p_hwfn->dpi_size)); + out_params->dpi_addr = p_hwfn->doorbells + dpi_start_offset + + out_params->dpi * p_hwfn->dpi_size; out_params->dpi_phys_addr = p_hwfn->db_phys_addr + dpi_start_offset + diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index 898f595ea3d6..74efca15fde7 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -225,7 +225,7 @@ struct qed_rdma_start_in_params { struct qed_rdma_add_user_out_params { u16 dpi; - u64 dpi_addr; + void __iomem *dpi_addr; u64 dpi_phys_addr; u32 dpi_size; u16 wid_count; -- cgit v1.2.3 From b2b998c0f944993c9ef435569651e407d607af41 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Sun, 9 Jun 2019 20:19:03 +0200 Subject: leds: class: Improve LED and LED flash class registration API Replace of_led_classdev_register() with led_classdev_register_ext(), which accepts easily extendable struct led_init_data, instead of the fixed struct device_node argument. The latter can be now passed in an fwnode property of the struct led_init_data. The modification is driven by the need for passing additional arguments required for the forthcoming generic mechanism for composing LED names. Currently the LED name is conveyed in the "name" char pointer property of the struct led_classdev. This is redundant since LED class device name is accessible throughout the whole LED class device life time via associated struct device's kobj->name property. The change will not break any existing clients since the patch alters also existing led_classdev{_flash}_register() macro wrappers, that pass NULL in place of init_data, which leads to using legacy name initialization path basing on the struct led_classdev's "name" property. Three existing users of devm_of_led_classdev_registers() are modified to use devm_led_classdev_register(), which will not impact their operation since they in fact didn't need to pass struct device_node on registration from the beginning. Signed-off-by: Jacek Anaszewski Cc: Baolin Wang Cc: Dan Murphy Cc: Daniel Mack Cc: Linus Walleij Cc: Oleh Kravchenko Cc: Sakari Ailus Cc: Simon Shields Acked-by: Pavel Machek --- drivers/leds/led-class-flash.c | 9 +++++---- drivers/leds/led-class.c | 29 +++++++++++++++++------------ drivers/leds/leds-cr0014114.c | 3 +-- drivers/leds/leds-gpio.c | 2 +- drivers/leds/leds-max77650.c | 2 +- drivers/leds/leds-pwm.c | 3 +-- include/linux/led-class-flash.h | 15 ++++++++++----- include/linux/leds.h | 34 ++++++++++++++++++++++++---------- 8 files changed, 60 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/leds/led-class-flash.c b/drivers/leds/led-class-flash.c index 94980c654d89..60c3de5c6b9f 100644 --- a/drivers/leds/led-class-flash.c +++ b/drivers/leds/led-class-flash.c @@ -282,8 +282,9 @@ static void led_flash_init_sysfs_groups(struct led_classdev_flash *fled_cdev) led_cdev->groups = flash_groups; } -int led_classdev_flash_register(struct device *parent, - struct led_classdev_flash *fled_cdev) +int led_classdev_flash_register_ext(struct device *parent, + struct led_classdev_flash *fled_cdev, + struct led_init_data *init_data) { struct led_classdev *led_cdev; const struct led_flash_ops *ops; @@ -309,13 +310,13 @@ int led_classdev_flash_register(struct device *parent, } /* Register led class device */ - ret = led_classdev_register(parent, led_cdev); + ret = led_classdev_register_ext(parent, led_cdev, init_data); if (ret < 0) return ret; return 0; } -EXPORT_SYMBOL_GPL(led_classdev_flash_register); +EXPORT_SYMBOL_GPL(led_classdev_flash_register_ext); void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev) { diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 4793e77808e2..242122f49333 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -242,14 +243,16 @@ static int led_classdev_next_name(const char *init_name, char *name, } /** - * of_led_classdev_register - register a new object of led_classdev class. + * led_classdev_register_ext - register a new object of led_classdev class + * with init data. * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. - * @np: DT node describing this LED + * @init_data: LED class device initialization data */ -int of_led_classdev_register(struct device *parent, struct device_node *np, - struct led_classdev *led_cdev) +int led_classdev_register_ext(struct device *parent, + struct led_classdev *led_cdev, + struct led_init_data *init_data) { char name[LED_MAX_NAME_SIZE]; int ret; @@ -266,7 +269,8 @@ int of_led_classdev_register(struct device *parent, struct device_node *np, mutex_unlock(&led_cdev->led_access); return PTR_ERR(led_cdev->dev); } - led_cdev->dev->of_node = np; + if (init_data && init_data->fwnode) + led_cdev->dev->of_node = to_of_node(init_data->fwnode); if (ret) dev_warn(parent, "Led %s renamed to %s due to name collision", @@ -311,7 +315,7 @@ int of_led_classdev_register(struct device *parent, struct device_node *np, return 0; } -EXPORT_SYMBOL_GPL(of_led_classdev_register); +EXPORT_SYMBOL_GPL(led_classdev_register_ext); /** * led_classdev_unregister - unregisters a object of led_properties class. @@ -356,14 +360,15 @@ static void devm_led_classdev_release(struct device *dev, void *res) } /** - * devm_of_led_classdev_register - resource managed led_classdev_register() + * devm_led_classdev_register_ext - resource managed led_classdev_register_ext() * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. + * @init_data: LED class device initialization data */ -int devm_of_led_classdev_register(struct device *parent, - struct device_node *np, - struct led_classdev *led_cdev) +int devm_led_classdev_register_ext(struct device *parent, + struct led_classdev *led_cdev, + struct led_init_data *init_data) { struct led_classdev **dr; int rc; @@ -372,7 +377,7 @@ int devm_of_led_classdev_register(struct device *parent, if (!dr) return -ENOMEM; - rc = of_led_classdev_register(parent, np, led_cdev); + rc = led_classdev_register_ext(parent, led_cdev, init_data); if (rc) { devres_free(dr); return rc; @@ -383,7 +388,7 @@ int devm_of_led_classdev_register(struct device *parent, return 0; } -EXPORT_SYMBOL_GPL(devm_of_led_classdev_register); +EXPORT_SYMBOL_GPL(devm_led_classdev_register_ext); static int devm_led_classdev_match(struct device *dev, void *res, void *data) { diff --git a/drivers/leds/leds-cr0014114.c b/drivers/leds/leds-cr0014114.c index 0e4262462cb9..1c82152764d2 100644 --- a/drivers/leds/leds-cr0014114.c +++ b/drivers/leds/leds-cr0014114.c @@ -207,8 +207,7 @@ static int cr0014114_probe_dt(struct cr0014114 *priv) led->ldev.max_brightness = CR_MAX_BRIGHTNESS; led->ldev.brightness_set_blocking = cr0014114_set_sync; - ret = devm_of_led_classdev_register(priv->dev, np, - &led->ldev); + ret = devm_led_classdev_register(priv->dev, &led->ldev); if (ret) { dev_err(priv->dev, "failed to register LED device %s, err %d", diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index bdc98ddca1dc..8f463c912db8 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -108,7 +108,7 @@ static int create_gpio_led(const struct gpio_led *template, if (ret < 0) return ret; - return devm_of_led_classdev_register(parent, np, &led_dat->cdev); + return devm_led_classdev_register(parent, &led_dat->cdev); } struct gpio_leds_priv { diff --git a/drivers/leds/leds-max77650.c b/drivers/leds/leds-max77650.c index 04738324b3e6..5a14f9775b0e 100644 --- a/drivers/leds/leds-max77650.c +++ b/drivers/leds/leds-max77650.c @@ -118,7 +118,7 @@ static int max77650_led_probe(struct platform_device *pdev) of_property_read_string(child, "linux,default-trigger", &led->cdev.default_trigger); - rv = devm_of_led_classdev_register(dev, child, &led->cdev); + rv = devm_led_classdev_register(dev, &led->cdev); if (rv) goto err_node_put; diff --git a/drivers/leds/leds-pwm.c b/drivers/leds/leds-pwm.c index 48d068f80f11..d0e1f2710351 100644 --- a/drivers/leds/leds-pwm.c +++ b/drivers/leds/leds-pwm.c @@ -111,8 +111,7 @@ static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv, if (!led_data->period && (led->pwm_period_ns > 0)) led_data->period = led->pwm_period_ns; - ret = devm_of_led_classdev_register(dev, to_of_node(fwnode), - &led_data->cdev); + ret = devm_led_classdev_register(dev, &led_data->cdev); if (ret == 0) { priv->num_leds++; led_pwm_set(&led_data->cdev, led_data->cdev.brightness); diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h index f52713f0a269..1e824963af17 100644 --- a/include/linux/led-class-flash.h +++ b/include/linux/led-class-flash.h @@ -86,15 +86,20 @@ static inline struct led_classdev_flash *lcdev_to_flcdev( } /** - * led_classdev_flash_register - register a new object of led_classdev class - * with support for flash LEDs - * @parent: the flash LED to register + * led_classdev_flash_register_ext - register a new object of LED class with + * init data and with support for flash LEDs + * @parent: LED flash controller device this flash LED is driven by * @fled_cdev: the led_classdev_flash structure for this device + * @init_data: the LED class flash device initialization data * * Returns: 0 on success or negative error value on failure */ -extern int led_classdev_flash_register(struct device *parent, - struct led_classdev_flash *fled_cdev); +extern int led_classdev_flash_register_ext(struct device *parent, + struct led_classdev_flash *fled_cdev, + struct led_init_data *init_data); + +#define led_classdev_flash_register(parent, fled_cdev) \ + led_classdev_flash_register_ext(parent, fled_cdev, NULL) /** * led_classdev_flash_unregister - unregisters an object of led_classdev class diff --git a/include/linux/leds.h b/include/linux/leds.h index 9b2bf574a17a..2bbe8a38fe39 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -30,6 +30,11 @@ enum led_brightness { LED_FULL = 255, }; +struct led_init_data { + /* device fwnode handle */ + struct fwnode_handle *fwnode; +}; + struct led_classdev { const char *name; enum led_brightness brightness; @@ -125,16 +130,25 @@ struct led_classdev { struct mutex led_access; }; -extern int of_led_classdev_register(struct device *parent, - struct device_node *np, - struct led_classdev *led_cdev); -#define led_classdev_register(parent, led_cdev) \ - of_led_classdev_register(parent, NULL, led_cdev) -extern int devm_of_led_classdev_register(struct device *parent, - struct device_node *np, - struct led_classdev *led_cdev); -#define devm_led_classdev_register(parent, led_cdev) \ - devm_of_led_classdev_register(parent, NULL, led_cdev) +/** + * led_classdev_register_ext - register a new object of LED class with + * init data + * @parent: LED controller device this LED is driven by + * @led_cdev: the led_classdev structure for this device + * @init_data: the LED class device initialization data + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_classdev_register_ext(struct device *parent, + struct led_classdev *led_cdev, + struct led_init_data *init_data); +#define led_classdev_register(parent, led_cdev) \ + led_classdev_register_ext(parent, led_cdev, NULL) +extern int devm_led_classdev_register_ext(struct device *parent, + struct led_classdev *led_cdev, + struct led_init_data *init_data); +#define devm_led_classdev_register(parent, led_cdev) \ + devm_led_classdev_register_ext(parent, led_cdev, NULL) extern void led_classdev_unregister(struct led_classdev *led_cdev); extern void devm_led_classdev_unregister(struct device *parent, struct led_classdev *led_cdev); -- cgit v1.2.3 From 853a78a7d6c7734f5aaa9c2a866cbd1f2a658892 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Sun, 9 Jun 2019 20:19:04 +0200 Subject: dt-bindings: leds: Add LED_COLOR_ID definitions Add common LED color identifiers. Signed-off-by: Jacek Anaszewski Cc: Baolin Wang Cc: Daniel Mack Cc: Linus Walleij Cc: Oleh Kravchenko Cc: Sakari Ailus Cc: Simon Shields Reviewed-by: Rob Herring Acked-by: Pavel Machek Reviewed-by: Dan Murphy --- include/dt-bindings/leds/common.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/leds/common.h b/include/dt-bindings/leds/common.h index e171d0a6beb2..217ee9e0dd6c 100644 --- a/include/dt-bindings/leds/common.h +++ b/include/dt-bindings/leds/common.h @@ -19,4 +19,15 @@ #define LEDS_BOOST_ADAPTIVE 1 #define LEDS_BOOST_FIXED 2 +/* Standard LED colors */ +#define LED_COLOR_ID_WHITE 0 +#define LED_COLOR_ID_RED 1 +#define LED_COLOR_ID_GREEN 2 +#define LED_COLOR_ID_BLUE 3 +#define LED_COLOR_ID_AMBER 4 +#define LED_COLOR_ID_VIOLET 5 +#define LED_COLOR_ID_YELLOW 6 +#define LED_COLOR_ID_IR 7 +#define LED_COLOR_ID_MAX 8 + #endif /* __DT_BINDINGS_LEDS_H */ -- cgit v1.2.3 From 2f430310f7b4216408aaae57c55a596091c47846 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Sun, 9 Jun 2019 20:19:04 +0200 Subject: dt-bindings: leds: Add LED_FUNCTION definitions Add initial set of common LED function definitions. Signed-off-by: Jacek Anaszewski Cc: Baolin Wang Cc: Daniel Mack Cc: Linus Walleij Cc: Oleh Kravchenko Cc: Sakari Ailus Cc: Simon Shields Reviewed-by: Rob Herring Acked-by: Pavel Machek Reviewed-by: Dan Murphy --- include/dt-bindings/leds/common.h | 44 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/leds/common.h b/include/dt-bindings/leds/common.h index 217ee9e0dd6c..9e1256a7c1bf 100644 --- a/include/dt-bindings/leds/common.h +++ b/include/dt-bindings/leds/common.h @@ -3,8 +3,9 @@ * This header provides macros for the common LEDs device tree bindings. * * Copyright (C) 2015, Samsung Electronics Co., Ltd. - * * Author: Jacek Anaszewski + * + * Copyright (C) 2019 Jacek Anaszewski */ #ifndef __DT_BINDINGS_LEDS_H @@ -30,4 +31,45 @@ #define LED_COLOR_ID_IR 7 #define LED_COLOR_ID_MAX 8 +/* Standard LED functions */ +#define LED_FUNCTION_ACTIVITY "activity" +#define LED_FUNCTION_ALARM "alarm" +#define LED_FUNCTION_BACKLIGHT "backlight" +#define LED_FUNCTION_BLUETOOTH "bluetooth" +#define LED_FUNCTION_BOOT "boot" +#define LED_FUNCTION_CPU "cpu" +#define LED_FUNCTION_CAPSLOCK "capslock" +#define LED_FUNCTION_CHARGING "charging" +#define LED_FUNCTION_DEBUG "debug" +#define LED_FUNCTION_DISK "disk" +#define LED_FUNCTION_DISK_ACTIVITY "disk-activity" +#define LED_FUNCTION_DISK_ERR "disk-err" +#define LED_FUNCTION_DISK_READ "disk-read" +#define LED_FUNCTION_DISK_WRITE "disk-write" +#define LED_FUNCTION_FAULT "fault" +#define LED_FUNCTION_FLASH "flash" +#define LED_FUNCTION_HEARTBEAT "heartbeat" +#define LED_FUNCTION_INDICATOR "indicator" +#define LED_FUNCTION_KBD_BACKLIGHT "kbd_backlight" +#define LED_FUNCTION_LAN "lan" +#define LED_FUNCTION_MAIL "mail" +#define LED_FUNCTION_MTD "mtd" +#define LED_FUNCTION_MICMUTE "micmute" +#define LED_FUNCTION_MUTE "mute" +#define LED_FUNCTION_NUMLOCK "numlock" +#define LED_FUNCTION_PANIC "panic" +#define LED_FUNCTION_PROGRAMMING "programming" +#define LED_FUNCTION_POWER "power" +#define LED_FUNCTION_RX "rx" +#define LED_FUNCTION_SD "sd" +#define LED_FUNCTION_SCROLLLOCK "scrolllock" +#define LED_FUNCTION_STANDBY "standby" +#define LED_FUNCTION_STATUS "status" +#define LED_FUNCTION_TORCH "torch" +#define LED_FUNCTION_TX "tx" +#define LED_FUNCTION_USB "usb" +#define LED_FUNCTION_WAN "wan" +#define LED_FUNCTION_WLAN "wlan" +#define LED_FUNCTION_WPS "wps" + #endif /* __DT_BINDINGS_LEDS_H */ -- cgit v1.2.3 From bb4e9af0348dfeafd66c7e7f82e8a0983fe5390c Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Sun, 9 Jun 2019 20:19:04 +0200 Subject: leds: core: Add support for composing LED class device names Add generic support for composing LED class device name. The newly introduced led_compose_name() function composes device name according to either or pattern, depending on the configuration of initialization data. Backward compatibility with in-driver hard-coded LED class device names is assured thanks to the default_label and devicename properties of newly introduced struct led_init_data. In case none of the aforementioned properties was found, then, for OF nodes, the node name is adopted for LED class device name. At the occassion of amending the Documentation/leds/leds-class.txt unify spelling: colour -> color. Alongside these changes added is a new tool - tools/leds/get_led_device_info.sh. The tool allows retrieving details of a LED class device's parent device, which proves that using vendor or product name for devicename part of LED name doesn't convey any added value since that information had been already available in sysfs. The script performs also basic validation of a LED class device name. Signed-off-by: Jacek Anaszewski Cc: Baolin Wang Cc: Dan Murphy Cc: Daniel Mack Cc: Linus Walleij Cc: Oleh Kravchenko Cc: Sakari Ailus Cc: Simon Shields Reviewed-by: Linus Walleij Acked-by: Pavel Machek --- Documentation/leds/leds-class.rst | 70 ++++++++++++- drivers/leds/led-class.c | 20 +++- drivers/leds/led-core.c | 127 ++++++++++++++++++++++++ drivers/leds/leds.h | 1 + include/linux/leds.h | 45 +++++++++ tools/leds/get_led_device_info.sh | 201 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 458 insertions(+), 6 deletions(-) create mode 100755 tools/leds/get_led_device_info.sh (limited to 'include') diff --git a/Documentation/leds/leds-class.rst b/Documentation/leds/leds-class.rst index df0120a1ee3c..a0708d3f3d0b 100644 --- a/Documentation/leds/leds-class.rst +++ b/Documentation/leds/leds-class.rst @@ -43,9 +43,73 @@ LED Device Naming Is currently of the form: - "devicename:colour:function" - -There have been calls for LED properties such as colour to be exported as + "devicename:color:function" + +- devicename: + it should refer to a unique identifier created by the kernel, + like e.g. phyN for network devices or inputN for input devices, rather + than to the hardware; the information related to the product and the bus + to which given device is hooked is available in sysfs and can be + retrieved using get_led_device_info.sh script from tools/leds; generally + this section is expected mostly for LEDs that are somehow associated with + other devices. + +- color: + one of LED_COLOR_ID_* definitions from the header + include/dt-bindings/leds/common.h. + +- function: + one of LED_FUNCTION_* definitions from the header + include/dt-bindings/leds/common.h. + +If required color or function is missing, please submit a patch +to linux-leds@vger.kernel.org. + +It is possible that more than one LED with the same color and function will +be required for given platform, differing only with an ordinal number. +In this case it is preferable to just concatenate the predefined LED_FUNCTION_* +name with required "-N" suffix in the driver. fwnode based drivers can use +function-enumerator property for that and then the concatenation will be handled +automatically by the LED core upon LED class device registration. + +LED subsystem has also a protection against name clash, that may occur +when LED class device is created by a driver of hot-pluggable device and +it doesn't provide unique devicename section. In this case numerical +suffix (e.g. "_1", "_2", "_3" etc.) is added to the requested LED class +device name. + +There might be still LED class drivers around using vendor or product name +for devicename, but this approach is now deprecated as it doesn't convey +any added value. Product information can be found in other places in sysfs +(see tools/leds/get_led_device_info.sh). + +Examples of proper LED names: + + - "red:disk" + - "white:flash" + - "red:indicator" + - "phy1:green:wlan" + - "phy3::wlan" + - ":kbd_backlight" + - "input5::kbd_backlight" + - "input3::numlock" + - "input3::scrolllock" + - "input3::capslock" + - "mmc1::status" + - "white:status" + +get_led_device_info.sh script can be used for verifying if the LED name +meets the requirements pointed out here. It performs validation of the LED class +devicename sections and gives hints on expected value for a section in case +the validation fails for it. So far the script supports validation +of associations between LEDs and following types of devices: + + - input devices + - ieee80211 compliant USB devices + +The script is open to extensions. + +There have been calls for LED properties such as color to be exported as individual led class attributes. As a solution which doesn't incur as much overhead, I suggest these become part of the device name. The naming scheme above leaves scope for further attributes should they be needed. If sections diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 242122f49333..508d6477d0b8 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -254,17 +254,31 @@ int led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { - char name[LED_MAX_NAME_SIZE]; + char composed_name[LED_MAX_NAME_SIZE]; + char final_name[LED_MAX_NAME_SIZE]; + const char *proposed_name = composed_name; int ret; - ret = led_classdev_next_name(led_cdev->name, name, sizeof(name)); + if (init_data) { + if (init_data->devname_mandatory && !init_data->devicename) { + dev_err(parent, "Mandatory device name is missing"); + return -EINVAL; + } + ret = led_compose_name(parent, init_data, composed_name); + if (ret < 0) + return ret; + } else { + proposed_name = led_cdev->name; + } + + ret = led_classdev_next_name(proposed_name, final_name, sizeof(final_name)); if (ret < 0) return ret; mutex_init(&led_cdev->led_access); mutex_lock(&led_cdev->led_access); led_cdev->dev = device_create_with_groups(leds_class, parent, 0, - led_cdev, led_cdev->groups, "%s", name); + led_cdev, led_cdev->groups, "%s", final_name); if (IS_ERR(led_cdev->dev)) { mutex_unlock(&led_cdev->led_access); return PTR_ERR(led_cdev->dev); diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 7107cd7e87cf..f0c1c403f678 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -13,8 +13,10 @@ #include #include #include +#include #include #include +#include #include "leds.h" DECLARE_RWSEM(leds_list_lock); @@ -23,6 +25,18 @@ EXPORT_SYMBOL_GPL(leds_list_lock); LIST_HEAD(leds_list); EXPORT_SYMBOL_GPL(leds_list); +const char * const led_colors[LED_COLOR_ID_MAX] = { + [LED_COLOR_ID_WHITE] = "white", + [LED_COLOR_ID_RED] = "red", + [LED_COLOR_ID_GREEN] = "green", + [LED_COLOR_ID_BLUE] = "blue", + [LED_COLOR_ID_AMBER] = "amber", + [LED_COLOR_ID_VIOLET] = "violet", + [LED_COLOR_ID_YELLOW] = "yellow", + [LED_COLOR_ID_IR] = "ir", +}; +EXPORT_SYMBOL_GPL(led_colors); + static int __led_set_brightness(struct led_classdev *led_cdev, enum led_brightness value) { @@ -353,3 +367,116 @@ void led_sysfs_enable(struct led_classdev *led_cdev) led_cdev->flags &= ~LED_SYSFS_DISABLE; } EXPORT_SYMBOL_GPL(led_sysfs_enable); + +static void led_parse_fwnode_props(struct device *dev, + struct fwnode_handle *fwnode, + struct led_properties *props) +{ + int ret; + + if (!fwnode) + return; + + if (fwnode_property_present(fwnode, "label")) { + ret = fwnode_property_read_string(fwnode, "label", &props->label); + if (ret) + dev_err(dev, "Error parsing 'label' property (%d)\n", ret); + return; + } + + if (fwnode_property_present(fwnode, "color")) { + ret = fwnode_property_read_u32(fwnode, "color", &props->color); + if (ret) + dev_err(dev, "Error parsing 'color' property (%d)\n", ret); + else if (props->color >= LED_COLOR_ID_MAX) + dev_err(dev, "LED color identifier out of range\n"); + else + props->color_present = true; + } + + + if (!fwnode_property_present(fwnode, "function")) + return; + + ret = fwnode_property_read_string(fwnode, "function", &props->function); + if (ret) { + dev_err(dev, + "Error parsing 'function' property (%d)\n", + ret); + } + + if (!fwnode_property_present(fwnode, "function-enumerator")) + return; + + ret = fwnode_property_read_u32(fwnode, "function-enumerator", + &props->func_enum); + if (ret) { + dev_err(dev, + "Error parsing 'function-enumerator' property (%d)\n", + ret); + } else { + props->func_enum_present = true; + } +} + +int led_compose_name(struct device *dev, struct led_init_data *init_data, + char *led_classdev_name) +{ + struct led_properties props = {}; + struct fwnode_handle *fwnode = init_data->fwnode; + const char *devicename = init_data->devicename; + + if (!led_classdev_name) + return -EINVAL; + + led_parse_fwnode_props(dev, fwnode, &props); + + if (props.label) { + /* + * If init_data.devicename is NULL, then it indicates that + * DT label should be used as-is for LED class device name. + * Otherwise the label is prepended with devicename to compose + * the final LED class device name. + */ + if (!devicename) { + strscpy(led_classdev_name, props.label, + LED_MAX_NAME_SIZE); + } else { + snprintf(led_classdev_name, LED_MAX_NAME_SIZE, "%s:%s", + devicename, props.label); + } + } else if (props.function || props.color_present) { + char tmp_buf[LED_MAX_NAME_SIZE]; + + if (props.func_enum_present) { + snprintf(tmp_buf, LED_MAX_NAME_SIZE, "%s:%s-%d", + props.color_present ? led_colors[props.color] : "", + props.function ?: "", props.func_enum); + } else { + snprintf(tmp_buf, LED_MAX_NAME_SIZE, "%s:%s", + props.color_present ? led_colors[props.color] : "", + props.function ?: ""); + } + if (init_data->devname_mandatory) { + snprintf(led_classdev_name, LED_MAX_NAME_SIZE, "%s:%s", + devicename, tmp_buf); + } else { + strscpy(led_classdev_name, tmp_buf, LED_MAX_NAME_SIZE); + + } + } else if (init_data->default_label) { + if (!devicename) { + dev_err(dev, "Legacy LED naming requires devicename segment"); + return -EINVAL; + } + snprintf(led_classdev_name, LED_MAX_NAME_SIZE, "%s:%s", + devicename, init_data->default_label); + } else if (is_of_node(fwnode)) { + strscpy(led_classdev_name, to_of_node(fwnode)->name, + LED_MAX_NAME_SIZE); + } else + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(led_compose_name); diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h index 47b229469069..0b577cece8f7 100644 --- a/drivers/leds/leds.h +++ b/drivers/leds/leds.h @@ -27,5 +27,6 @@ void led_set_brightness_nosleep(struct led_classdev *led_cdev, extern struct rw_semaphore leds_list_lock; extern struct list_head leds_list; extern struct list_head trigger_list; +extern const char * const led_colors[LED_COLOR_ID_MAX]; #endif /* __LEDS_H_INCLUDED */ diff --git a/include/linux/leds.h b/include/linux/leds.h index 2bbe8a38fe39..d101fd13e18e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -8,6 +8,7 @@ #ifndef __LINUX_LEDS_H_INCLUDED #define __LINUX_LEDS_H_INCLUDED +#include #include #include #include @@ -33,6 +34,25 @@ enum led_brightness { struct led_init_data { /* device fwnode handle */ struct fwnode_handle *fwnode; + /* + * default tuple, for backward compatibility + * with in-driver hard-coded LED names used as a fallback when + * DT "label" property is absent; it should be set to NULL + * in new LED class drivers. + */ + const char *default_label; + /* + * string to be used for devicename section of LED class device + * either for label based LED name composition path or for fwnode + * based when devname_mandatory is true + */ + const char *devicename; + /* + * indicates if LED name should always comprise devicename section; + * only LEDs exposed by drivers of hot-pluggable devices should + * set it to true + */ + bool devname_mandatory; }; struct led_classdev { @@ -257,6 +277,22 @@ extern void led_sysfs_disable(struct led_classdev *led_cdev); */ extern void led_sysfs_enable(struct led_classdev *led_cdev); +/** + * led_compose_name - compose LED class device name + * @dev: LED controller device object + * @child: child fwnode_handle describing a LED or a group of synchronized LEDs; + * it must be provided only for fwnode based LEDs + * @led_classdev_name: composed LED class device name + * + * Create LED class device name basing on the provided init_data argument. + * The name can have or . + * form, depending on the init_data configuration. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_compose_name(struct device *dev, struct led_init_data *init_data, + char *led_classdev_name); + /** * led_sysfs_is_disabled - check if LED sysfs interface is disabled * @led_cdev: the LED to query @@ -434,6 +470,15 @@ struct led_platform_data { struct led_info *leds; }; +struct led_properties { + u32 color; + bool color_present; + const char *function; + u32 func_enum; + bool func_enum_present; + const char *label; +}; + struct gpio_desc; typedef int (*gpio_blink_set_t)(struct gpio_desc *desc, int state, unsigned long *delay_on, diff --git a/tools/leds/get_led_device_info.sh b/tools/leds/get_led_device_info.sh new file mode 100755 index 000000000000..ccfa442db5fe --- /dev/null +++ b/tools/leds/get_led_device_info.sh @@ -0,0 +1,201 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +led_common_defs_path="include/dt-bindings/leds/common.h" + +num_args=$# +if [ $num_args -eq 1 ]; then + linux_top=$(dirname `realpath $0` | awk -F/ \ + '{ \ + i=1; \ + while (i <= NF - 2) { \ + printf $i"/"; \ + i++; \ + }; \ + }') + led_defs_path=$linux_top/$led_common_defs_path +elif [ $num_args -eq 2 ]; then + led_defs_path=`realpath $2` +else + echo "Usage: get_led_device_info.sh LED_CDEV_PATH [LED_COMMON_DEFS_PATH]" + exit 1 +fi + +if [ ! -f $led_defs_path ]; then + echo "$led_defs_path doesn't exist" + exit 1 +fi + +led_cdev_path=`echo $1 | sed s'/\/$//'` + +ls "$led_cdev_path/brightness" > /dev/null 2>&1 +if [ $? -ne 0 ]; then + echo "Device \"$led_cdev_path\" does not exist." + exit 1 +fi + +bus=`readlink $led_cdev_path/device/subsystem | sed s'/.*\///'` +usb_subdev=`readlink $led_cdev_path | grep usb | sed s'/\(.*usb[0-9]*\/[0-9]*-[0-9]*\)\/.*/\1/'` +ls "$led_cdev_path/device/of_node/compatible" > /dev/null 2>&1 +of_node_missing=$? + +if [ "$bus" = "input" ]; then + input_node=`readlink $led_cdev_path/device | sed s'/.*\///'` + if [ ! -z "$usb_subdev" ]; then + bus="usb" + fi +fi + +if [ "$bus" = "usb" ]; then + usb_interface=`readlink $led_cdev_path | sed s'/.*\(usb[0-9]*\)/\1/' | cut -d\/ -f3` + cd $led_cdev_path/../$usb_subdev + driver=`readlink $usb_interface/driver | sed s'/.*\///'` + if [ -d "$usb_interface/ieee80211" ]; then + wifi_phy=`ls -l $usb_interface/ieee80211 | grep phy | awk '{print $9}'` + fi + idVendor=`cat idVendor` + idProduct=`cat idProduct` + manufacturer=`cat manufacturer` + product=`cat product` +elif [ "$bus" = "input" ]; then + cd $led_cdev_path + product=`cat device/name` + driver=`cat device/device/driver/description` +elif [ $of_node_missing -eq 0 ]; then + cd $led_cdev_path + compatible=`cat device/of_node/compatible` + if [ "$compatible" = "gpio-leds" ]; then + driver="leds-gpio" + elif [ "$compatible" = "pwm-leds" ]; then + driver="leds-pwm" + else + manufacturer=`echo $compatible | awk -F, '{print $1}'` + product=`echo $compatible | awk -F, '{print $2}'` + fi +else + echo "Unknown device type." + exit 1 +fi + +printf "\n#####################################\n" +printf "# LED class device hardware details #\n" +printf "#####################################\n\n" + +printf "bus:\t\t\t$bus\n" + +if [ ! -z "$idVendor" ]; then + printf "idVendor:\t\t$idVendor\n" +fi + +if [ ! -z "$idProduct" ]; then + printf "idProduct:\t\t$idProduct\n" +fi + +if [ ! -z "$manufacturer" ]; then + printf "manufacturer:\t\t$manufacturer\n" +fi + +if [ ! -z "$product" ]; then + printf "product:\t\t$product\n" +fi + +if [ ! -z "$driver" ]; then + printf "driver:\t\t\t$driver\n" +fi + +if [ ! -z "$input_node" ]; then + printf "associated input node:\t$input_node\n" +fi + +printf "\n####################################\n" +printf "# LED class device name validation #\n" +printf "####################################\n\n" + +led_name=`echo $led_cdev_path | sed s'/.*\///'` + +num_sections=`echo $led_name | awk -F: '{print NF}'` + +if [ $num_sections -eq 1 ]; then + printf "\":\" delimiter not detected.\t[ FAILED ]\n" + exit 1 +elif [ $num_sections -eq 2 ]; then + color=`echo $led_name | cut -d: -f1` + function=`echo $led_name | cut -d: -f2` +elif [ $num_sections -eq 3 ]; then + devicename=`echo $led_name | cut -d: -f1` + color=`echo $led_name | cut -d: -f2` + function=`echo $led_name | cut -d: -f3` +else + printf "Detected %d sections in the LED class device name - should the script be updated?\n" $num_sections + exit 1 +fi + +S_DEV="devicename" +S_CLR="color " +S_FUN="function " +status_tab=20 + +print_msg_ok() +{ + local section_name="$1" + local section_val="$2" + local msg="$3" + printf "$section_name :\t%-${status_tab}.${status_tab}s %s %s\n" "$section_val" "[ OK ] " "$msg" +} + +print_msg_failed() +{ + local section_name="$1" + local section_val="$2" + local msg="$3" + printf "$section_name :\t%-${status_tab}.${status_tab}s %s %s\n" "$section_val" "[ FAILED ]" "$msg" +} + +if [ ! -z "$input_node" ]; then + expected_devname=$input_node +elif [ ! -z "$wifi_phy" ]; then + expected_devname=$wifi_phy +fi + +if [ ! -z "$devicename" ]; then + if [ ! -z "$expected_devname" ]; then + if [ "$devicename" = "$expected_devname" ]; then + print_msg_ok "$S_DEV" "$devicename" + else + print_msg_failed "$S_DEV" "$devicename" "Expected: $expected_devname" + fi + else + if [ "$devicename" = "$manufacturer" ]; then + print_msg_failed "$S_DEV" "$devicename" "Redundant: use of vendor name is discouraged" + elif [ "$devicename" = "$product" ]; then + print_msg_failed "$S_DEV" "$devicename" "Redundant: use of product name is discouraged" + else + print_msg_failed "$S_DEV" "$devicename" "Unknown devicename - should the script be updated?" + fi + fi +elif [ ! -z "$expected_devname" ]; then + print_msg_failed "$S_DEV" "blank" "Expected: $expected_devname" +fi + +if [ ! -z "$color" ]; then + color_upper=`echo $color | tr [:lower:] [:upper:]` + color_id_definition=$(cat $led_defs_path | grep "_$color_upper\s" | awk '{print $2}') + if [ ! -z "$color_id_definition" ]; then + print_msg_ok "$S_CLR" "$color" "Matching definition: $color_id_definition" + else + print_msg_failed "$S_CLR" "$color" "Definition not found in $led_defs_path" + fi + +fi + +if [ ! -z "$function" ]; then + # strip optional enumerator + function=`echo $function | sed s'/\(.*\)-[0-9]*$/\1/'` + fun_definition=$(cat $led_defs_path | grep "\"$function\"" | awk '{print $2}') + if [ ! -z "$fun_definition" ]; then + print_msg_ok "$S_FUN" "$function" "Matching definition: $fun_definition" + else + print_msg_failed "$S_FUN" "$function" "Definition not found in $led_defs_path" + fi + +fi -- cgit v1.2.3 From f8be17b81d44aed1f9ea68c3fc70f501c9616e2d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 23 Jul 2019 10:22:48 +0300 Subject: lib/dim: Fix -Wunused-const-variable warnings DIM causes to the following warnings during kernel compilation which indicates that tx_profile and rx_profile are supposed to be declared in *.c and not in *.h files. In file included from ./include/rdma/ib_verbs.h:64, from ./include/linux/mlx5/device.h:37, from ./include/linux/mlx5/driver.h:51, from ./include/linux/mlx5/vport.h:36, from drivers/infiniband/hw/mlx5/ib_virt.c:34: ./include/linux/dim.h:326:1: warning: _tx_profile_ defined but not used [-Wunused-const-variable=] 326 | tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { | ^~~~~~~~~~ ./include/linux/dim.h:320:1: warning: _rx_profile_ defined but not used [-Wunused-const-variable=] 320 | rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { | ^~~~~~~~~~ Fixes: 4f75da3666c0 ("linux/dim: Move implementation to .c files") Signed-off-by: Leon Romanovsky Reviewed-by: Bart Van Assche Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/dim.h | 56 ----------------------------------------------------- lib/dim/net_dim.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/include/linux/dim.h b/include/linux/dim.h index d3a0fbfff2bb..9fa4b3f88c39 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -272,62 +272,6 @@ dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps, /* Net DIM */ -/* - * Net DIM profiles: - * There are different set of profiles for each CQ period mode. - * There are different set of profiles for RX/TX CQs. - * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES - */ -#define NET_DIM_PARAMS_NUM_PROFILES 5 -#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 -#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 -#define NET_DIM_DEF_PROFILE_CQE 1 -#define NET_DIM_DEF_PROFILE_EQE 1 - -#define NET_DIM_RX_EQE_PROFILES { \ - {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ -} - -#define NET_DIM_RX_CQE_PROFILES { \ - {2, 256}, \ - {8, 128}, \ - {16, 64}, \ - {32, 64}, \ - {64, 64} \ -} - -#define NET_DIM_TX_EQE_PROFILES { \ - {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ -} - -#define NET_DIM_TX_CQE_PROFILES { \ - {5, 128}, \ - {8, 64}, \ - {16, 32}, \ - {32, 32}, \ - {64, 32} \ -} - -static const struct dim_cq_moder -rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { - NET_DIM_RX_EQE_PROFILES, - NET_DIM_RX_CQE_PROFILES, -}; - -static const struct dim_cq_moder -tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { - NET_DIM_TX_EQE_PROFILES, - NET_DIM_TX_CQE_PROFILES, -}; - /** * net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile * @cq_period_mode: CQ period mode diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index 5bcc902c5388..a4db51c21266 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -5,6 +5,62 @@ #include +/* + * Net DIM profiles: + * There are different set of profiles for each CQ period mode. + * There are different set of profiles for RX/TX CQs. + * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix) { -- cgit v1.2.3 From 02712bc3250849c1cf99d626aea98f610e695f34 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jul 2019 08:52:53 +0200 Subject: mm/hmm: move hmm_vma_range_done and hmm_vma_fault to nouveau These two functions are marked as a legacy APIs to get rid of, but seem to suit the current nouveau flow. Move it to the only user in preparation for fixing a locking bug involving caller and callee. All comments referring to the old API have been removed as this now is a driver private helper. Link: https://lore.kernel.org/r/20190724065258.16603-3-hch@lst.de Tested-by: Ralph Campbell Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/nouveau/nouveau_svm.c | 46 +++++++++++++++++++++++++++-- include/linux/hmm.h | 54 ----------------------------------- 2 files changed, 44 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 8c92374afcf2..6c1b04de0db8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -475,6 +475,48 @@ nouveau_svm_fault_cache(struct nouveau_svm *svm, fault->inst, fault->addr, fault->access); } +static inline bool +nouveau_range_done(struct hmm_range *range) +{ + bool ret = hmm_range_valid(range); + + hmm_range_unregister(range); + return ret; +} + +static int +nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range, + bool block) +{ + long ret; + + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + + ret = hmm_range_register(range, mirror, + range->start, range->end, + PAGE_SHIFT); + if (ret) + return (int)ret; + + if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { + up_read(&range->vma->vm_mm->mmap_sem); + return -EAGAIN; + } + + ret = hmm_range_fault(range, block); + if (ret <= 0) { + if (ret == -EBUSY || !ret) { + up_read(&range->vma->vm_mm->mmap_sem); + ret = -EBUSY; + } else if (ret == -EAGAIN) + ret = -EBUSY; + hmm_range_unregister(range); + return ret; + } + return 0; +} + static int nouveau_svm_fault(struct nvif_notify *notify) { @@ -649,10 +691,10 @@ nouveau_svm_fault(struct nvif_notify *notify) range.values = nouveau_svm_pfn_values; range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; again: - ret = hmm_vma_fault(&svmm->mirror, &range, true); + ret = nouveau_range_fault(&svmm->mirror, &range, true); if (ret == 0) { mutex_lock(&svmm->mutex); - if (!hmm_vma_range_done(&range)) { + if (!nouveau_range_done(&range)) { mutex_unlock(&svmm->mutex); goto again; } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index b8a08b2a10ca..7ef56dc18050 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -484,60 +484,6 @@ long hmm_range_dma_unmap(struct hmm_range *range, */ #define HMM_RANGE_DEFAULT_TIMEOUT 1000 -/* This is a temporary helper to avoid merge conflict between trees. */ -static inline bool hmm_vma_range_done(struct hmm_range *range) -{ - bool ret = hmm_range_valid(range); - - hmm_range_unregister(range); - return ret; -} - -/* This is a temporary helper to avoid merge conflict between trees. */ -static inline int hmm_vma_fault(struct hmm_mirror *mirror, - struct hmm_range *range, bool block) -{ - long ret; - - /* - * With the old API the driver must set each individual entries with - * the requested flags (valid, write, ...). So here we set the mask to - * keep intact the entries provided by the driver and zero out the - * default_flags. - */ - range->default_flags = 0; - range->pfn_flags_mask = -1UL; - - ret = hmm_range_register(range, mirror, - range->start, range->end, - PAGE_SHIFT); - if (ret) - return (int)ret; - - if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { - /* - * The mmap_sem was taken by driver we release it here and - * returns -EAGAIN which correspond to mmap_sem have been - * drop in the old API. - */ - up_read(&range->vma->vm_mm->mmap_sem); - return -EAGAIN; - } - - ret = hmm_range_fault(range, block); - if (ret <= 0) { - if (ret == -EBUSY || !ret) { - /* Same as above, drop mmap_sem to match old API. */ - up_read(&range->vma->vm_mm->mmap_sem); - ret = -EBUSY; - } else if (ret == -EAGAIN) - ret = -EBUSY; - hmm_range_unregister(range); - return ret; - } - return 0; -} - /* Below are for HMM internal use only! Not to be used by device driver! */ static inline void hmm_mm_init(struct mm_struct *mm) { -- cgit v1.2.3 From f32471e2cf87112b8f5dc10469b27c39c1a41722 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jul 2019 08:52:57 +0200 Subject: mm/hmm: remove the legacy hmm_pfn_* APIs Switch the one remaining user in nouveau over to its replacement, and remove all the wrappers. Link: https://lore.kernel.org/r/20190724065258.16603-7-hch@lst.de Tested-by: Ralph Campbell Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/hmm.h | 34 ---------------------------------- 2 files changed, 1 insertion(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 1333220787a1..345c63cb752a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -845,7 +845,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, struct page *page; uint64_t addr; - page = hmm_pfn_to_page(range, range->pfns[i]); + page = hmm_device_entry_to_page(range, range->pfns[i]); if (page == NULL) continue; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7ef56dc18050..9f32586684c9 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -290,40 +290,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, range->flags[HMM_PFN_VALID]; } -/* - * Old API: - * hmm_pfn_to_page() - * hmm_pfn_to_pfn() - * hmm_pfn_from_page() - * hmm_pfn_from_pfn() - * - * This are the OLD API please use new API, it is here to avoid cross-tree - * merge painfullness ie we convert things to new API in stages. - */ -static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, - uint64_t pfn) -{ - return hmm_device_entry_to_page(range, pfn); -} - -static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, - uint64_t pfn) -{ - return hmm_device_entry_to_pfn(range, pfn); -} - -static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, - struct page *page) -{ - return hmm_device_entry_from_page(range, page); -} - -static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, - unsigned long pfn) -{ - return hmm_device_entry_from_pfn(range, pfn); -} - /* * Mirroring: how to synchronize device page table with CPU page table. * -- cgit v1.2.3 From 7a32f2962c56d9d8a836b4469855caeee8766bd4 Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Tue, 23 Jul 2019 10:12:55 +0300 Subject: net/mlx5: Fix modify_cq_in alignment Fix modify_cq_in alignment to match the device specification. After this fix the 'cq_umem_valid' field will be in the right offset. Cc: # 4.19 Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits") Signed-off-by: Edward Srouji Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b3d5752657d9..ec571fd7fcf8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5975,10 +5975,12 @@ struct mlx5_ifc_modify_cq_in_bits { struct mlx5_ifc_cqc_bits cq_context; - u8 reserved_at_280[0x40]; + u8 reserved_at_280[0x60]; u8 cq_umem_valid[0x1]; - u8 reserved_at_2c1[0x5bf]; + u8 reserved_at_2e1[0x1f]; + + u8 reserved_at_300[0x580]; u8 pas[0][0x40]; }; -- cgit v1.2.3 From 90bb769291161cf25a818d69cf608c181654473e Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sat, 6 Jul 2019 18:06:15 +0300 Subject: net/mlx5e: Prevent encap flow counter update async to user query This patch prevents a race between user invoked cached counters query and a neighbor last usage updater. The cached flow counter stats can be queried by calling "mlx5_fc_query_cached" which provides the number of bytes and packets that passed via this flow since the last time this counter was queried. It does so by reducting the last saved stats from the current, cached stats and then updating the last saved stats with the cached stats. It also provide the lastuse value for that flow. Since "mlx5e_tc_update_neigh_used_value" needs to retrieve the last usage time of encapsulation flows, it calls the flow counter query method periodically and async to user queries of the flow counter using cls_flower. This call is causing the driver to update the last reported bytes and packets from the cache and therefore, future user queries of the flow stats will return lower than expected number for bytes and packets since the last saved stats in the driver was updated async to the last saved stats in cls_flower. This causes wrong stats presentation of encapsulation flows to user. Since the neighbor usage updater only needs the lastuse stats from the cached counter, the fix is to use a dedicated lastuse query call that returns the lastuse value without synching between the cached stats and the last saved stats. Fixes: f6dfb4c3f216 ("net/mlx5e: Update neighbour 'used' state using HW flow rules counters") Signed-off-by: Ariel Levkovich Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 5 +++++ include/linux/mlx5/fs.h | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index cc096f6011d9..7ecfc53cf5f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1230,13 +1230,13 @@ static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow) void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) { struct mlx5e_neigh *m_neigh = &nhe->m_neigh; - u64 bytes, packets, lastuse = 0; struct mlx5e_tc_flow *flow; struct mlx5e_encap_entry *e; struct mlx5_fc *counter; struct neigh_table *tbl; bool neigh_used = false; struct neighbour *n; + u64 lastuse; if (m_neigh->family == AF_INET) tbl = &arp_tbl; @@ -1256,7 +1256,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) encaps[efi->index]); if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) { counter = mlx5e_tc_get_counter(flow); - mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + lastuse = mlx5_fc_query_lastuse(counter); if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) { neigh_used = true; break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index b3762123a69c..1834d9f3aa1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -369,6 +369,11 @@ int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, } EXPORT_SYMBOL(mlx5_fc_query); +u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter) +{ + return counter->cache.lastuse; +} + void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse) { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 04a569568eac..f049af3f3cd8 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -220,6 +220,7 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); +u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, -- cgit v1.2.3 From 086f95682114fd2d1790bd3226e76cbae9a2d192 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:25 -0700 Subject: bpf/flow_dissector: pass input flags to BPF flow dissector program C flow dissector supports input flags that tell it to customize parsing by either stopping early or trying to parse as deep as possible. Pass those flags to the BPF flow dissector so it can make the same decisions. In the next commits I'll add support for those flags to our reference bpf_flow.c v3: * Export copy of flow dissector flags instead of moving (Alexei Starovoitov) Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 2 +- include/uapi/linux/bpf.h | 5 +++++ net/bpf/test_run.c | 2 +- net/core/flow_dissector.c | 12 ++++++++++-- 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 718742b1c505..9b7a8038beec 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1271,7 +1271,7 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen); + __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fa1c753dcdbc..88b9d743036f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3507,6 +3507,10 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) + struct bpf_flow_keys { __u16 nhoff; __u16 thoff; @@ -3528,6 +3532,7 @@ struct bpf_flow_keys { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; }; + __u32 flags; }; struct bpf_func_info { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 80e6f3a6864d..4e41d15a1098 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -419,7 +419,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, - size); + size, 0); if (signal_pending(current)) { preempt_enable(); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3e6fedb57bc1..50ed1a688709 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -784,7 +784,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen) + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -795,6 +795,14 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != + (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != + (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != + (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); + flow_keys->flags = flags; + preempt_disable(); result = BPF_PROG_RUN(prog, ctx); preempt_enable(); @@ -914,7 +922,7 @@ bool __skb_flow_dissect(const struct net *net, } ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, - hlen); + hlen, flags); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); -- cgit v1.2.3 From 71c99e32b926159ea628352751f66383d7d04d17 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:30 -0700 Subject: bpf/flow_dissector: support ipv6 flow_label and BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL Add support for exporting ipv6 flow label via bpf_flow_keys. Export flow label from bpf_flow.c and also return early when BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL is passed. Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/flow_dissector.c | 9 +++++ tools/include/uapi/linux/bpf.h | 1 + .../selftests/bpf/prog_tests/flow_dissector.c | 46 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_flow.c | 10 +++++ 5 files changed, 67 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 88b9d743036f..e985f07a98ed 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3533,6 +3533,7 @@ struct bpf_flow_keys { }; }; __u32 flags; + __be32 flow_label; }; struct bpf_func_info { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 50ed1a688709..9741b593ea53 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -737,6 +737,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, @@ -781,6 +782,14 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_keys->flow_label); + } } bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2e4b0848d795..3d7fc67ec1b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3530,6 +3530,7 @@ struct bpf_flow_keys { }; }; __u32 flags; + __be32 flow_label; }; struct bpf_func_info { diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 8e8c18aced9b..ef83f145a6f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -20,6 +20,7 @@ "is_encap=%u/%u " \ "ip_proto=0x%x/0x%x " \ "n_proto=0x%x/0x%x " \ + "flow_label=0x%x/0x%x " \ "sport=%u/%u " \ "dport=%u/%u\n", \ got.nhoff, expected.nhoff, \ @@ -30,6 +31,7 @@ got.is_encap, expected.is_encap, \ got.ip_proto, expected.ip_proto, \ got.n_proto, expected.n_proto, \ + got.flow_label, expected.flow_label, \ got.sport, expected.sport, \ got.dport, expected.dport) @@ -257,6 +259,50 @@ struct test tests[] = { .is_first_frag = true, }, }, + { + .name = "ipv6-flow-label", + .pkt.ipv6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.flow_lbl = { 0xb, 0xee, 0xef }, + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct ipv6hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), + .sport = 80, + .dport = 8080, + .flow_label = __bpf_constant_htonl(0xbeeef), + }, + }, + { + .name = "ipv6-no-flow-label", + .pkt.ipv6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.flow_lbl = { 0xb, 0xee, 0xef }, + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct ipv6hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), + .flow_label = __bpf_constant_htonl(0xbeeef), + }, + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, + }, }; static int create_tap(const char *ifname) diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index f931cd3db9d4..7fbfa22f33df 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -83,6 +83,12 @@ static __always_inline int export_flow_keys(struct bpf_flow_keys *keys, return ret; } +#define IPV6_FLOWLABEL_MASK __bpf_constant_htonl(0x000FFFFF) +static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) +{ + return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; +} + static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb, __u16 hdr_size, void *buffer) @@ -308,6 +314,10 @@ PROG(IPV6)(struct __sk_buff *skb) keys->thoff += sizeof(struct ipv6hdr); keys->ip_proto = ip6h->nexthdr; + keys->flow_label = ip6_flowlabel(ip6h); + + if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) + return export_flow_keys(keys, BPF_OK); return parse_ipv6_proto(skb, ip6h->nexthdr); } -- cgit v1.2.3 From 9552389c465ed1ded39edf4a5642a861b53c2955 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Tue, 2 Jul 2019 14:39:20 +0300 Subject: crypto: fips - add FIPS test failure notification chain Crypto test failures in FIPS mode cause an immediate panic, but on some system the cryptographic boundary extends beyond just the Linux controlled domain. Add a simple atomic notification chain to allow interested parties to register to receive notification prior to us kicking the bucket. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- crypto/fips.c | 11 +++++++++++ crypto/testmgr.c | 4 +++- include/linux/fips.h | 7 +++++++ 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/crypto/fips.c b/crypto/fips.c index c0b3a3c3452d..7b1d8caee669 100644 --- a/crypto/fips.c +++ b/crypto/fips.c @@ -11,10 +11,14 @@ #include #include #include +#include int fips_enabled; EXPORT_SYMBOL_GPL(fips_enabled); +ATOMIC_NOTIFIER_HEAD(fips_fail_notif_chain); +EXPORT_SYMBOL_GPL(fips_fail_notif_chain); + /* Process kernel command-line parameter at boot time. fips=0 or fips=1 */ static int fips_enable(char *str) { @@ -58,6 +62,13 @@ static void crypto_proc_fips_exit(void) unregister_sysctl_table(crypto_sysctls); } +void fips_fail_notify(void) +{ + if (fips_enabled) + atomic_notifier_call_chain(&fips_fail_notif_chain, 0, NULL); +} +EXPORT_SYMBOL_GPL(fips_fail_notify); + static int __init fips_init(void) { crypto_proc_fips_init(); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index d0b5b33806a6..8ba1e75cd973 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5240,9 +5240,11 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask) type, mask); test_done: - if (rc && (fips_enabled || panic_on_fail)) + if (rc && (fips_enabled || panic_on_fail)) { + fips_fail_notify(); panic("alg: self-tests for %s (%s) failed in %s mode!\n", driver, alg, fips_enabled ? "fips" : "panic_on_fail"); + } if (fips_enabled && !rc) pr_info("alg: self-tests for %s (%s) passed\n", driver, alg); diff --git a/include/linux/fips.h b/include/linux/fips.h index afeeece92302..c6961e932fef 100644 --- a/include/linux/fips.h +++ b/include/linux/fips.h @@ -4,8 +4,15 @@ #ifdef CONFIG_CRYPTO_FIPS extern int fips_enabled; +extern struct atomic_notifier_head fips_fail_notif_chain; + +void fips_fail_notify(void); + #else #define fips_enabled 0 + +static inline void fips_fail_notify(void) {} + #endif #endif -- cgit v1.2.3 From e59c1c98745637796df824c0177f279b6e9cad94 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Jul 2019 21:41:22 +0200 Subject: crypto: aes - create AES library based on the fixed time AES code Take the existing small footprint and mostly time invariant C code and turn it into a AES library that can be used for non-performance critical, casual use of AES, and as a fallback for, e.g., SIMD code that needs a secondary path that can be taken in contexts where the SIMD unit is off limits (e.g., in hard interrupts taken from kernel context) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Kconfig | 4 + crypto/aes_ti.c | 303 +------------------------------------------- include/crypto/aes.h | 34 +++++ lib/crypto/Makefile | 3 + lib/crypto/aes.c | 350 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 394 insertions(+), 300 deletions(-) create mode 100644 lib/crypto/aes.c (limited to 'include') diff --git a/crypto/Kconfig b/crypto/Kconfig index e801450bcb1c..091ebbbc9655 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1066,6 +1066,9 @@ config CRYPTO_GHASH_CLMUL_NI_INTEL comment "Ciphers" +config CRYPTO_LIB_AES + tristate + config CRYPTO_AES tristate "AES cipher algorithms" select CRYPTO_ALGAPI @@ -1089,6 +1092,7 @@ config CRYPTO_AES config CRYPTO_AES_TI tristate "Fixed time AES cipher" select CRYPTO_ALGAPI + select CRYPTO_LIB_AES help This is a generic implementation of AES that attempts to eliminate data dependent latencies as much as possible without affecting diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c index b3ebdc5679cb..205c2c257d49 100644 --- a/crypto/aes_ti.c +++ b/crypto/aes_ti.c @@ -8,249 +8,19 @@ #include #include #include -#include - -/* - * Emit the sbox as volatile const to prevent the compiler from doing - * constant folding on sbox references involving fixed indexes. - */ -static volatile const u8 __cacheline_aligned __aesti_sbox[] = { - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, - 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, - 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, - 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, - 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, - 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, - 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, - 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, - 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, - 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, - 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, - 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, - 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, - 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, - 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, - 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, - 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, -}; - -static volatile const u8 __cacheline_aligned __aesti_inv_sbox[] = { - 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, - 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, - 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, - 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, - 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, - 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, - 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, - 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, - 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, - 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, - 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, - 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, - 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, - 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, - 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, - 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, - 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, - 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, - 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, - 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, - 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, - 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, - 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, - 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, - 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, - 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, - 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, - 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, - 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, - 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, - 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, - 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, -}; - -static u32 mul_by_x(u32 w) -{ - u32 x = w & 0x7f7f7f7f; - u32 y = w & 0x80808080; - - /* multiply by polynomial 'x' (0b10) in GF(2^8) */ - return (x << 1) ^ (y >> 7) * 0x1b; -} - -static u32 mul_by_x2(u32 w) -{ - u32 x = w & 0x3f3f3f3f; - u32 y = w & 0x80808080; - u32 z = w & 0x40404040; - - /* multiply by polynomial 'x^2' (0b100) in GF(2^8) */ - return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b; -} - -static u32 mix_columns(u32 x) -{ - /* - * Perform the following matrix multiplication in GF(2^8) - * - * | 0x2 0x3 0x1 0x1 | | x[0] | - * | 0x1 0x2 0x3 0x1 | | x[1] | - * | 0x1 0x1 0x2 0x3 | x | x[2] | - * | 0x3 0x1 0x1 0x2 | | x[3] | - */ - u32 y = mul_by_x(x) ^ ror32(x, 16); - - return y ^ ror32(x ^ y, 8); -} - -static u32 inv_mix_columns(u32 x) -{ - /* - * Perform the following matrix multiplication in GF(2^8) - * - * | 0xe 0xb 0xd 0x9 | | x[0] | - * | 0x9 0xe 0xb 0xd | | x[1] | - * | 0xd 0x9 0xe 0xb | x | x[2] | - * | 0xb 0xd 0x9 0xe | | x[3] | - * - * which can conveniently be reduced to - * - * | 0x2 0x3 0x1 0x1 | | 0x5 0x0 0x4 0x0 | | x[0] | - * | 0x1 0x2 0x3 0x1 | | 0x0 0x5 0x0 0x4 | | x[1] | - * | 0x1 0x1 0x2 0x3 | x | 0x4 0x0 0x5 0x0 | x | x[2] | - * | 0x3 0x1 0x1 0x2 | | 0x0 0x4 0x0 0x5 | | x[3] | - */ - u32 y = mul_by_x2(x); - - return mix_columns(x ^ y ^ ror32(y, 16)); -} - -static __always_inline u32 subshift(u32 in[], int pos) -{ - return (__aesti_sbox[in[pos] & 0xff]) ^ - (__aesti_sbox[(in[(pos + 1) % 4] >> 8) & 0xff] << 8) ^ - (__aesti_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^ - (__aesti_sbox[(in[(pos + 3) % 4] >> 24) & 0xff] << 24); -} - -static __always_inline u32 inv_subshift(u32 in[], int pos) -{ - return (__aesti_inv_sbox[in[pos] & 0xff]) ^ - (__aesti_inv_sbox[(in[(pos + 3) % 4] >> 8) & 0xff] << 8) ^ - (__aesti_inv_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^ - (__aesti_inv_sbox[(in[(pos + 1) % 4] >> 24) & 0xff] << 24); -} - -static u32 subw(u32 in) -{ - return (__aesti_sbox[in & 0xff]) ^ - (__aesti_sbox[(in >> 8) & 0xff] << 8) ^ - (__aesti_sbox[(in >> 16) & 0xff] << 16) ^ - (__aesti_sbox[(in >> 24) & 0xff] << 24); -} - -static int aesti_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - unsigned int key_len) -{ - u32 kwords = key_len / sizeof(u32); - u32 rc, i, j; - - if (key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) - return -EINVAL; - - ctx->key_length = key_len; - - for (i = 0; i < kwords; i++) - ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32)); - - for (i = 0, rc = 1; i < 10; i++, rc = mul_by_x(rc)) { - u32 *rki = ctx->key_enc + (i * kwords); - u32 *rko = rki + kwords; - - rko[0] = ror32(subw(rki[kwords - 1]), 8) ^ rc ^ rki[0]; - rko[1] = rko[0] ^ rki[1]; - rko[2] = rko[1] ^ rki[2]; - rko[3] = rko[2] ^ rki[3]; - - if (key_len == 24) { - if (i >= 7) - break; - rko[4] = rko[3] ^ rki[4]; - rko[5] = rko[4] ^ rki[5]; - } else if (key_len == 32) { - if (i >= 6) - break; - rko[4] = subw(rko[3]) ^ rki[4]; - rko[5] = rko[4] ^ rki[5]; - rko[6] = rko[5] ^ rki[6]; - rko[7] = rko[6] ^ rki[7]; - } - } - - /* - * Generate the decryption keys for the Equivalent Inverse Cipher. - * This involves reversing the order of the round keys, and applying - * the Inverse Mix Columns transformation to all but the first and - * the last one. - */ - ctx->key_dec[0] = ctx->key_enc[key_len + 24]; - ctx->key_dec[1] = ctx->key_enc[key_len + 25]; - ctx->key_dec[2] = ctx->key_enc[key_len + 26]; - ctx->key_dec[3] = ctx->key_enc[key_len + 27]; - - for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) { - ctx->key_dec[i] = inv_mix_columns(ctx->key_enc[j]); - ctx->key_dec[i + 1] = inv_mix_columns(ctx->key_enc[j + 1]); - ctx->key_dec[i + 2] = inv_mix_columns(ctx->key_enc[j + 2]); - ctx->key_dec[i + 3] = inv_mix_columns(ctx->key_enc[j + 3]); - } - - ctx->key_dec[i] = ctx->key_enc[0]; - ctx->key_dec[i + 1] = ctx->key_enc[1]; - ctx->key_dec[i + 2] = ctx->key_enc[2]; - ctx->key_dec[i + 3] = ctx->key_enc[3]; - - return 0; -} static int aesti_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - return aesti_expand_key(ctx, in_key, key_len); + return aes_expandkey(ctx, in_key, key_len); } static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - const u32 *rkp = ctx->key_enc + 4; - int rounds = 6 + ctx->key_length / 4; - u32 st0[4], st1[4]; unsigned long flags; - int round; - - st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in); - st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4); - st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8); - st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12); /* * Temporarily disable interrupts to avoid races where cachelines are @@ -258,36 +28,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ local_irq_save(flags); - /* - * Force the compiler to emit data independent Sbox references, - * by xoring the input with Sbox values that are known to add up - * to zero. This pulls the entire Sbox into the D-cache before any - * data dependent lookups are done. - */ - st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[ 64] ^ __aesti_sbox[134] ^ __aesti_sbox[195]; - st0[1] ^= __aesti_sbox[16] ^ __aesti_sbox[ 82] ^ __aesti_sbox[158] ^ __aesti_sbox[221]; - st0[2] ^= __aesti_sbox[32] ^ __aesti_sbox[ 96] ^ __aesti_sbox[160] ^ __aesti_sbox[234]; - st0[3] ^= __aesti_sbox[48] ^ __aesti_sbox[112] ^ __aesti_sbox[186] ^ __aesti_sbox[241]; - - for (round = 0;; round += 2, rkp += 8) { - st1[0] = mix_columns(subshift(st0, 0)) ^ rkp[0]; - st1[1] = mix_columns(subshift(st0, 1)) ^ rkp[1]; - st1[2] = mix_columns(subshift(st0, 2)) ^ rkp[2]; - st1[3] = mix_columns(subshift(st0, 3)) ^ rkp[3]; - - if (round == rounds - 2) - break; - - st0[0] = mix_columns(subshift(st1, 0)) ^ rkp[4]; - st0[1] = mix_columns(subshift(st1, 1)) ^ rkp[5]; - st0[2] = mix_columns(subshift(st1, 2)) ^ rkp[6]; - st0[3] = mix_columns(subshift(st1, 3)) ^ rkp[7]; - } - - put_unaligned_le32(subshift(st1, 0) ^ rkp[4], out); - put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4); - put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8); - put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12); + aes_encrypt(ctx, out, in); local_irq_restore(flags); } @@ -295,16 +36,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - const u32 *rkp = ctx->key_dec + 4; - int rounds = 6 + ctx->key_length / 4; - u32 st0[4], st1[4]; unsigned long flags; - int round; - - st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in); - st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4); - st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8); - st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12); /* * Temporarily disable interrupts to avoid races where cachelines are @@ -312,36 +44,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ local_irq_save(flags); - /* - * Force the compiler to emit data independent Sbox references, - * by xoring the input with Sbox values that are known to add up - * to zero. This pulls the entire Sbox into the D-cache before any - * data dependent lookups are done. - */ - st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[ 64] ^ __aesti_inv_sbox[129] ^ __aesti_inv_sbox[200]; - st0[1] ^= __aesti_inv_sbox[16] ^ __aesti_inv_sbox[ 83] ^ __aesti_inv_sbox[150] ^ __aesti_inv_sbox[212]; - st0[2] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[ 96] ^ __aesti_inv_sbox[160] ^ __aesti_inv_sbox[236]; - st0[3] ^= __aesti_inv_sbox[48] ^ __aesti_inv_sbox[112] ^ __aesti_inv_sbox[187] ^ __aesti_inv_sbox[247]; - - for (round = 0;; round += 2, rkp += 8) { - st1[0] = inv_mix_columns(inv_subshift(st0, 0)) ^ rkp[0]; - st1[1] = inv_mix_columns(inv_subshift(st0, 1)) ^ rkp[1]; - st1[2] = inv_mix_columns(inv_subshift(st0, 2)) ^ rkp[2]; - st1[3] = inv_mix_columns(inv_subshift(st0, 3)) ^ rkp[3]; - - if (round == rounds - 2) - break; - - st0[0] = inv_mix_columns(inv_subshift(st1, 0)) ^ rkp[4]; - st0[1] = inv_mix_columns(inv_subshift(st1, 1)) ^ rkp[5]; - st0[2] = inv_mix_columns(inv_subshift(st1, 2)) ^ rkp[6]; - st0[3] = inv_mix_columns(inv_subshift(st1, 3)) ^ rkp[7]; - } - - put_unaligned_le32(inv_subshift(st1, 0) ^ rkp[4], out); - put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4); - put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8); - put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12); + aes_decrypt(ctx, out, in); local_irq_restore(flags); } diff --git a/include/crypto/aes.h b/include/crypto/aes.h index 0fdb542c70cd..d0067fca0cd0 100644 --- a/include/crypto/aes.h +++ b/include/crypto/aes.h @@ -37,4 +37,38 @@ int crypto_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len); int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); + +/** + * aes_expandkey - Expands the AES key as described in FIPS-197 + * @ctx: The location where the computed key will be stored. + * @in_key: The supplied key. + * @key_len: The length of the supplied key. + * + * Returns 0 on success. The function fails only if an invalid key size (or + * pointer) is supplied. + * The expanded key size is 240 bytes (max of 14 rounds with a unique 16 bytes + * key schedule plus a 16 bytes key which is used before the first round). + * The decryption key is prepared for the "Equivalent Inverse Cipher" as + * described in FIPS-197. The first slot (16 bytes) of each key (enc or dec) is + * for the initial combination, the second slot for the first round and so on. + */ +int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, + unsigned int key_len); + +/** + * aes_encrypt - Encrypt a single AES block + * @ctx: Context struct containing the key schedule + * @out: Buffer to store the ciphertext + * @in: Buffer containing the plaintext + */ +void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); + +/** + * aes_decrypt - Decrypt a single AES block + * @ctx: Context struct containing the key schedule + * @out: Buffer to store the plaintext + * @in: Buffer containing the ciphertext + */ +void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); + #endif diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 88195c34932d..42a91c62d96d 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o +libaes-y := aes.o + obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o libarc4-y := arc4.o diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c new file mode 100644 index 000000000000..9928b23e0a8a --- /dev/null +++ b/lib/crypto/aes.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017-2019 Linaro Ltd + */ + +#include +#include +#include +#include + +/* + * Emit the sbox as volatile const to prevent the compiler from doing + * constant folding on sbox references involving fixed indexes. + */ +static volatile const u8 __cacheline_aligned aes_sbox[] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, + 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, + 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, + 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, + 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, + 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, + 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, + 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, + 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, + 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, + 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, + 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, +}; + +static volatile const u8 __cacheline_aligned aes_inv_sbox[] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, + 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, + 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, + 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, + 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, + 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, + 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, + 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, + 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, + 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, + 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, +}; + +static u32 mul_by_x(u32 w) +{ + u32 x = w & 0x7f7f7f7f; + u32 y = w & 0x80808080; + + /* multiply by polynomial 'x' (0b10) in GF(2^8) */ + return (x << 1) ^ (y >> 7) * 0x1b; +} + +static u32 mul_by_x2(u32 w) +{ + u32 x = w & 0x3f3f3f3f; + u32 y = w & 0x80808080; + u32 z = w & 0x40404040; + + /* multiply by polynomial 'x^2' (0b100) in GF(2^8) */ + return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b; +} + +static u32 mix_columns(u32 x) +{ + /* + * Perform the following matrix multiplication in GF(2^8) + * + * | 0x2 0x3 0x1 0x1 | | x[0] | + * | 0x1 0x2 0x3 0x1 | | x[1] | + * | 0x1 0x1 0x2 0x3 | x | x[2] | + * | 0x3 0x1 0x1 0x2 | | x[3] | + */ + u32 y = mul_by_x(x) ^ ror32(x, 16); + + return y ^ ror32(x ^ y, 8); +} + +static u32 inv_mix_columns(u32 x) +{ + /* + * Perform the following matrix multiplication in GF(2^8) + * + * | 0xe 0xb 0xd 0x9 | | x[0] | + * | 0x9 0xe 0xb 0xd | | x[1] | + * | 0xd 0x9 0xe 0xb | x | x[2] | + * | 0xb 0xd 0x9 0xe | | x[3] | + * + * which can conveniently be reduced to + * + * | 0x2 0x3 0x1 0x1 | | 0x5 0x0 0x4 0x0 | | x[0] | + * | 0x1 0x2 0x3 0x1 | | 0x0 0x5 0x0 0x4 | | x[1] | + * | 0x1 0x1 0x2 0x3 | x | 0x4 0x0 0x5 0x0 | x | x[2] | + * | 0x3 0x1 0x1 0x2 | | 0x0 0x4 0x0 0x5 | | x[3] | + */ + u32 y = mul_by_x2(x); + + return mix_columns(x ^ y ^ ror32(y, 16)); +} + +static __always_inline u32 subshift(u32 in[], int pos) +{ + return (aes_sbox[in[pos] & 0xff]) ^ + (aes_sbox[(in[(pos + 1) % 4] >> 8) & 0xff] << 8) ^ + (aes_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^ + (aes_sbox[(in[(pos + 3) % 4] >> 24) & 0xff] << 24); +} + +static __always_inline u32 inv_subshift(u32 in[], int pos) +{ + return (aes_inv_sbox[in[pos] & 0xff]) ^ + (aes_inv_sbox[(in[(pos + 3) % 4] >> 8) & 0xff] << 8) ^ + (aes_inv_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^ + (aes_inv_sbox[(in[(pos + 1) % 4] >> 24) & 0xff] << 24); +} + +static u32 subw(u32 in) +{ + return (aes_sbox[in & 0xff]) ^ + (aes_sbox[(in >> 8) & 0xff] << 8) ^ + (aes_sbox[(in >> 16) & 0xff] << 16) ^ + (aes_sbox[(in >> 24) & 0xff] << 24); +} + +/** + * aes_expandkey - Expands the AES key as described in FIPS-197 + * @ctx: The location where the computed key will be stored. + * @in_key: The supplied key. + * @key_len: The length of the supplied key. + * + * Returns 0 on success. The function fails only if an invalid key size (or + * pointer) is supplied. + * The expanded key size is 240 bytes (max of 14 rounds with a unique 16 bytes + * key schedule plus a 16 bytes key which is used before the first round). + * The decryption key is prepared for the "Equivalent Inverse Cipher" as + * described in FIPS-197. The first slot (16 bytes) of each key (enc or dec) is + * for the initial combination, the second slot for the first round and so on. + */ +int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, + unsigned int key_len) +{ + u32 kwords = key_len / sizeof(u32); + u32 rc, i, j; + + if (key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256) + return -EINVAL; + + ctx->key_length = key_len; + + for (i = 0; i < kwords; i++) + ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32)); + + for (i = 0, rc = 1; i < 10; i++, rc = mul_by_x(rc)) { + u32 *rki = ctx->key_enc + (i * kwords); + u32 *rko = rki + kwords; + + rko[0] = ror32(subw(rki[kwords - 1]), 8) ^ rc ^ rki[0]; + rko[1] = rko[0] ^ rki[1]; + rko[2] = rko[1] ^ rki[2]; + rko[3] = rko[2] ^ rki[3]; + + if (key_len == AES_KEYSIZE_192) { + if (i >= 7) + break; + rko[4] = rko[3] ^ rki[4]; + rko[5] = rko[4] ^ rki[5]; + } else if (key_len == AES_KEYSIZE_256) { + if (i >= 6) + break; + rko[4] = subw(rko[3]) ^ rki[4]; + rko[5] = rko[4] ^ rki[5]; + rko[6] = rko[5] ^ rki[6]; + rko[7] = rko[6] ^ rki[7]; + } + } + + /* + * Generate the decryption keys for the Equivalent Inverse Cipher. + * This involves reversing the order of the round keys, and applying + * the Inverse Mix Columns transformation to all but the first and + * the last one. + */ + ctx->key_dec[0] = ctx->key_enc[key_len + 24]; + ctx->key_dec[1] = ctx->key_enc[key_len + 25]; + ctx->key_dec[2] = ctx->key_enc[key_len + 26]; + ctx->key_dec[3] = ctx->key_enc[key_len + 27]; + + for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) { + ctx->key_dec[i] = inv_mix_columns(ctx->key_enc[j]); + ctx->key_dec[i + 1] = inv_mix_columns(ctx->key_enc[j + 1]); + ctx->key_dec[i + 2] = inv_mix_columns(ctx->key_enc[j + 2]); + ctx->key_dec[i + 3] = inv_mix_columns(ctx->key_enc[j + 3]); + } + + ctx->key_dec[i] = ctx->key_enc[0]; + ctx->key_dec[i + 1] = ctx->key_enc[1]; + ctx->key_dec[i + 2] = ctx->key_enc[2]; + ctx->key_dec[i + 3] = ctx->key_enc[3]; + + return 0; +} +EXPORT_SYMBOL(aes_expandkey); + +/** + * aes_encrypt - Encrypt a single AES block + * @ctx: Context struct containing the key schedule + * @out: Buffer to store the ciphertext + * @in: Buffer containing the plaintext + */ +void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in) +{ + const u32 *rkp = ctx->key_enc + 4; + int rounds = 6 + ctx->key_length / 4; + u32 st0[4], st1[4]; + int round; + + st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in); + st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4); + st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8); + st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12); + + /* + * Force the compiler to emit data independent Sbox references, + * by xoring the input with Sbox values that are known to add up + * to zero. This pulls the entire Sbox into the D-cache before any + * data dependent lookups are done. + */ + st0[0] ^= aes_sbox[ 0] ^ aes_sbox[ 64] ^ aes_sbox[134] ^ aes_sbox[195]; + st0[1] ^= aes_sbox[16] ^ aes_sbox[ 82] ^ aes_sbox[158] ^ aes_sbox[221]; + st0[2] ^= aes_sbox[32] ^ aes_sbox[ 96] ^ aes_sbox[160] ^ aes_sbox[234]; + st0[3] ^= aes_sbox[48] ^ aes_sbox[112] ^ aes_sbox[186] ^ aes_sbox[241]; + + for (round = 0;; round += 2, rkp += 8) { + st1[0] = mix_columns(subshift(st0, 0)) ^ rkp[0]; + st1[1] = mix_columns(subshift(st0, 1)) ^ rkp[1]; + st1[2] = mix_columns(subshift(st0, 2)) ^ rkp[2]; + st1[3] = mix_columns(subshift(st0, 3)) ^ rkp[3]; + + if (round == rounds - 2) + break; + + st0[0] = mix_columns(subshift(st1, 0)) ^ rkp[4]; + st0[1] = mix_columns(subshift(st1, 1)) ^ rkp[5]; + st0[2] = mix_columns(subshift(st1, 2)) ^ rkp[6]; + st0[3] = mix_columns(subshift(st1, 3)) ^ rkp[7]; + } + + put_unaligned_le32(subshift(st1, 0) ^ rkp[4], out); + put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4); + put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8); + put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12); +} +EXPORT_SYMBOL(aes_encrypt); + +/** + * aes_decrypt - Decrypt a single AES block + * @ctx: Context struct containing the key schedule + * @out: Buffer to store the plaintext + * @in: Buffer containing the ciphertext + */ +void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in) +{ + const u32 *rkp = ctx->key_dec + 4; + int rounds = 6 + ctx->key_length / 4; + u32 st0[4], st1[4]; + int round; + + st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in); + st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4); + st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8); + st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12); + + /* + * Force the compiler to emit data independent Sbox references, + * by xoring the input with Sbox values that are known to add up + * to zero. This pulls the entire Sbox into the D-cache before any + * data dependent lookups are done. + */ + st0[0] ^= aes_inv_sbox[ 0] ^ aes_inv_sbox[ 64] ^ aes_inv_sbox[129] ^ aes_inv_sbox[200]; + st0[1] ^= aes_inv_sbox[16] ^ aes_inv_sbox[ 83] ^ aes_inv_sbox[150] ^ aes_inv_sbox[212]; + st0[2] ^= aes_inv_sbox[32] ^ aes_inv_sbox[ 96] ^ aes_inv_sbox[160] ^ aes_inv_sbox[236]; + st0[3] ^= aes_inv_sbox[48] ^ aes_inv_sbox[112] ^ aes_inv_sbox[187] ^ aes_inv_sbox[247]; + + for (round = 0;; round += 2, rkp += 8) { + st1[0] = inv_mix_columns(inv_subshift(st0, 0)) ^ rkp[0]; + st1[1] = inv_mix_columns(inv_subshift(st0, 1)) ^ rkp[1]; + st1[2] = inv_mix_columns(inv_subshift(st0, 2)) ^ rkp[2]; + st1[3] = inv_mix_columns(inv_subshift(st0, 3)) ^ rkp[3]; + + if (round == rounds - 2) + break; + + st0[0] = inv_mix_columns(inv_subshift(st1, 0)) ^ rkp[4]; + st0[1] = inv_mix_columns(inv_subshift(st1, 1)) ^ rkp[5]; + st0[2] = inv_mix_columns(inv_subshift(st1, 2)) ^ rkp[6]; + st0[3] = inv_mix_columns(inv_subshift(st1, 3)) ^ rkp[7]; + } + + put_unaligned_le32(inv_subshift(st1, 0) ^ rkp[4], out); + put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4); + put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8); + put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12); +} +EXPORT_SYMBOL(aes_decrypt); + +MODULE_DESCRIPTION("Generic AES library"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 5bb12d7825adf0e80b849a273834f3131a6cc4e1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Jul 2019 21:41:33 +0200 Subject: crypto: aes-generic - drop key expansion routine in favor of library version Drop aes-generic's version of crypto_aes_expand_key(), and switch to the key expansion routine provided by the AES library. AES key expansion is not performance critical, and it is better to have a single version shared by all AES implementations. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Kconfig | 1 + crypto/aes_generic.c | 153 +-------------------------------------------------- include/crypto/aes.h | 2 - 3 files changed, 3 insertions(+), 153 deletions(-) (limited to 'include') diff --git a/crypto/Kconfig b/crypto/Kconfig index df6f0be66574..80ea118600ab 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1072,6 +1072,7 @@ config CRYPTO_LIB_AES config CRYPTO_AES tristate "AES cipher algorithms" select CRYPTO_ALGAPI + select CRYPTO_LIB_AES help AES cipher algorithms (FIPS-197). AES uses the Rijndael algorithm. diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c index 3aa4a715c216..426deb437f19 100644 --- a/crypto/aes_generic.c +++ b/crypto/aes_generic.c @@ -1125,155 +1125,6 @@ EXPORT_SYMBOL_GPL(crypto_fl_tab); EXPORT_SYMBOL_GPL(crypto_it_tab); EXPORT_SYMBOL_GPL(crypto_il_tab); -/* initialise the key schedule from the user supplied key */ - -#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b) - -#define imix_col(y, x) do { \ - u = star_x(x); \ - v = star_x(u); \ - w = star_x(v); \ - t = w ^ (x); \ - (y) = u ^ v ^ w; \ - (y) ^= ror32(u ^ t, 8) ^ \ - ror32(v ^ t, 16) ^ \ - ror32(t, 24); \ -} while (0) - -#define ls_box(x) \ - crypto_fl_tab[0][byte(x, 0)] ^ \ - crypto_fl_tab[1][byte(x, 1)] ^ \ - crypto_fl_tab[2][byte(x, 2)] ^ \ - crypto_fl_tab[3][byte(x, 3)] - -#define loop4(i) do { \ - t = ror32(t, 8); \ - t = ls_box(t) ^ rco_tab[i]; \ - t ^= ctx->key_enc[4 * i]; \ - ctx->key_enc[4 * i + 4] = t; \ - t ^= ctx->key_enc[4 * i + 1]; \ - ctx->key_enc[4 * i + 5] = t; \ - t ^= ctx->key_enc[4 * i + 2]; \ - ctx->key_enc[4 * i + 6] = t; \ - t ^= ctx->key_enc[4 * i + 3]; \ - ctx->key_enc[4 * i + 7] = t; \ -} while (0) - -#define loop6(i) do { \ - t = ror32(t, 8); \ - t = ls_box(t) ^ rco_tab[i]; \ - t ^= ctx->key_enc[6 * i]; \ - ctx->key_enc[6 * i + 6] = t; \ - t ^= ctx->key_enc[6 * i + 1]; \ - ctx->key_enc[6 * i + 7] = t; \ - t ^= ctx->key_enc[6 * i + 2]; \ - ctx->key_enc[6 * i + 8] = t; \ - t ^= ctx->key_enc[6 * i + 3]; \ - ctx->key_enc[6 * i + 9] = t; \ - t ^= ctx->key_enc[6 * i + 4]; \ - ctx->key_enc[6 * i + 10] = t; \ - t ^= ctx->key_enc[6 * i + 5]; \ - ctx->key_enc[6 * i + 11] = t; \ -} while (0) - -#define loop8tophalf(i) do { \ - t = ror32(t, 8); \ - t = ls_box(t) ^ rco_tab[i]; \ - t ^= ctx->key_enc[8 * i]; \ - ctx->key_enc[8 * i + 8] = t; \ - t ^= ctx->key_enc[8 * i + 1]; \ - ctx->key_enc[8 * i + 9] = t; \ - t ^= ctx->key_enc[8 * i + 2]; \ - ctx->key_enc[8 * i + 10] = t; \ - t ^= ctx->key_enc[8 * i + 3]; \ - ctx->key_enc[8 * i + 11] = t; \ -} while (0) - -#define loop8(i) do { \ - loop8tophalf(i); \ - t = ctx->key_enc[8 * i + 4] ^ ls_box(t); \ - ctx->key_enc[8 * i + 12] = t; \ - t ^= ctx->key_enc[8 * i + 5]; \ - ctx->key_enc[8 * i + 13] = t; \ - t ^= ctx->key_enc[8 * i + 6]; \ - ctx->key_enc[8 * i + 14] = t; \ - t ^= ctx->key_enc[8 * i + 7]; \ - ctx->key_enc[8 * i + 15] = t; \ -} while (0) - -/** - * crypto_aes_expand_key - Expands the AES key as described in FIPS-197 - * @ctx: The location where the computed key will be stored. - * @in_key: The supplied key. - * @key_len: The length of the supplied key. - * - * Returns 0 on success. The function fails only if an invalid key size (or - * pointer) is supplied. - * The expanded key size is 240 bytes (max of 14 rounds with a unique 16 bytes - * key schedule plus a 16 bytes key which is used before the first round). - * The decryption key is prepared for the "Equivalent Inverse Cipher" as - * described in FIPS-197. The first slot (16 bytes) of each key (enc or dec) is - * for the initial combination, the second slot for the first round and so on. - */ -int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - unsigned int key_len) -{ - u32 i, t, u, v, w, j; - - if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) - return -EINVAL; - - ctx->key_length = key_len; - - ctx->key_enc[0] = get_unaligned_le32(in_key); - ctx->key_enc[1] = get_unaligned_le32(in_key + 4); - ctx->key_enc[2] = get_unaligned_le32(in_key + 8); - ctx->key_enc[3] = get_unaligned_le32(in_key + 12); - - ctx->key_dec[key_len + 24] = ctx->key_enc[0]; - ctx->key_dec[key_len + 25] = ctx->key_enc[1]; - ctx->key_dec[key_len + 26] = ctx->key_enc[2]; - ctx->key_dec[key_len + 27] = ctx->key_enc[3]; - - switch (key_len) { - case AES_KEYSIZE_128: - t = ctx->key_enc[3]; - for (i = 0; i < 10; ++i) - loop4(i); - break; - - case AES_KEYSIZE_192: - ctx->key_enc[4] = get_unaligned_le32(in_key + 16); - t = ctx->key_enc[5] = get_unaligned_le32(in_key + 20); - for (i = 0; i < 8; ++i) - loop6(i); - break; - - case AES_KEYSIZE_256: - ctx->key_enc[4] = get_unaligned_le32(in_key + 16); - ctx->key_enc[5] = get_unaligned_le32(in_key + 20); - ctx->key_enc[6] = get_unaligned_le32(in_key + 24); - t = ctx->key_enc[7] = get_unaligned_le32(in_key + 28); - for (i = 0; i < 6; ++i) - loop8(i); - loop8tophalf(i); - break; - } - - ctx->key_dec[0] = ctx->key_enc[key_len + 24]; - ctx->key_dec[1] = ctx->key_enc[key_len + 25]; - ctx->key_dec[2] = ctx->key_enc[key_len + 26]; - ctx->key_dec[3] = ctx->key_enc[key_len + 27]; - - for (i = 4; i < key_len + 24; ++i) { - j = key_len + 24 - (i & ~3) + (i & 3); - imix_col(ctx->key_dec[j], ctx->key_enc[i]); - } - return 0; -} -EXPORT_SYMBOL_GPL(crypto_aes_expand_key); - /** * crypto_aes_set_key - Set the AES key. * @tfm: The %crypto_tfm that is used in the context. @@ -1281,7 +1132,7 @@ EXPORT_SYMBOL_GPL(crypto_aes_expand_key); * @key_len: The size of the key. * * Returns 0 on success, on failure the %CRYPTO_TFM_RES_BAD_KEY_LEN flag in tfm - * is set. The function uses crypto_aes_expand_key() to expand the key. + * is set. The function uses aes_expand_key() to expand the key. * &crypto_aes_ctx _must_ be the private data embedded in @tfm which is * retrieved with crypto_tfm_ctx(). */ @@ -1292,7 +1143,7 @@ int crypto_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, u32 *flags = &tfm->crt_flags; int ret; - ret = crypto_aes_expand_key(ctx, in_key, key_len); + ret = aes_expandkey(ctx, in_key, key_len); if (!ret) return 0; diff --git a/include/crypto/aes.h b/include/crypto/aes.h index d0067fca0cd0..0a64a977f9b3 100644 --- a/include/crypto/aes.h +++ b/include/crypto/aes.h @@ -35,8 +35,6 @@ extern const u32 crypto_il_tab[4][256] ____cacheline_aligned; int crypto_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len); -int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - unsigned int key_len); /** * aes_expandkey - Expands the AES key as described in FIPS-197 -- cgit v1.2.3 From d9ec772d9550b6e513c51bc4e6fa1e3ffb50181c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Jul 2019 21:41:34 +0200 Subject: crypto: ctr - add helper for performing a CTR encryption walk Add a static inline helper modeled after crypto_cbc_encrypt_walk() that can be reused for SIMD algorithms that need to implement a non-SIMD fallback for performing CTR encryption. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/ctr.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include') diff --git a/include/crypto/ctr.h b/include/crypto/ctr.h index 06984a26c8cf..a1c66d1001af 100644 --- a/include/crypto/ctr.h +++ b/include/crypto/ctr.h @@ -8,8 +8,58 @@ #ifndef _CRYPTO_CTR_H #define _CRYPTO_CTR_H +#include +#include +#include +#include + #define CTR_RFC3686_NONCE_SIZE 4 #define CTR_RFC3686_IV_SIZE 8 #define CTR_RFC3686_BLOCK_SIZE 16 +static inline int crypto_ctr_encrypt_walk(struct skcipher_request *req, + void (*fn)(struct crypto_skcipher *, + const u8 *, u8 *)) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + int blocksize = crypto_skcipher_chunksize(tfm); + u8 buf[MAX_CIPHER_BLOCKSIZE]; + struct skcipher_walk walk; + int err; + + /* avoid integer division due to variable blocksize parameter */ + if (WARN_ON_ONCE(!is_power_of_2(blocksize))) + return -EINVAL; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes > 0) { + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + int nbytes = walk.nbytes; + int tail = 0; + + if (nbytes < walk.total) { + tail = walk.nbytes & (blocksize - 1); + nbytes -= tail; + } + + do { + int bsize = min(nbytes, blocksize); + + fn(tfm, walk.iv, buf); + + crypto_xor_cpy(dst, src, buf, bsize); + crypto_inc(walk.iv, blocksize); + + dst += bsize; + src += bsize; + nbytes -= bsize; + } while (nbytes > 0); + + err = skcipher_walk_done(&walk, tail); + } + return err; +} + #endif /* _CRYPTO_CTR_H */ -- cgit v1.2.3 From 1e25ca02a0619c10bc0eae975846926902381ce5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Jul 2019 21:41:45 +0200 Subject: crypto: aes-generic - unexport last-round AES tables The versions of the AES lookup tables that are only used during the last round are never used outside of the driver, so there is no need to export their symbols. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/aes_generic.c | 6 ++---- include/crypto/aes.h | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c index 426deb437f19..71a5c190d360 100644 --- a/crypto/aes_generic.c +++ b/crypto/aes_generic.c @@ -328,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] ____cacheline_aligned = { } }; -__visible const u32 crypto_fl_tab[4][256] ____cacheline_aligned = { +static const u32 crypto_fl_tab[4][256] ____cacheline_aligned = { { 0x00000063, 0x0000007c, 0x00000077, 0x0000007b, 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5, @@ -856,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] ____cacheline_aligned = { } }; -__visible const u32 crypto_il_tab[4][256] ____cacheline_aligned = { +static const u32 crypto_il_tab[4][256] ____cacheline_aligned = { { 0x00000052, 0x00000009, 0x0000006a, 0x000000d5, 0x00000030, 0x00000036, 0x000000a5, 0x00000038, @@ -1121,9 +1121,7 @@ __visible const u32 crypto_il_tab[4][256] ____cacheline_aligned = { }; EXPORT_SYMBOL_GPL(crypto_ft_tab); -EXPORT_SYMBOL_GPL(crypto_fl_tab); EXPORT_SYMBOL_GPL(crypto_it_tab); -EXPORT_SYMBOL_GPL(crypto_il_tab); /** * crypto_aes_set_key - Set the AES key. diff --git a/include/crypto/aes.h b/include/crypto/aes.h index 0a64a977f9b3..df8426fd8051 100644 --- a/include/crypto/aes.h +++ b/include/crypto/aes.h @@ -29,9 +29,7 @@ struct crypto_aes_ctx { }; extern const u32 crypto_ft_tab[4][256] ____cacheline_aligned; -extern const u32 crypto_fl_tab[4][256] ____cacheline_aligned; extern const u32 crypto_it_tab[4][256] ____cacheline_aligned; -extern const u32 crypto_il_tab[4][256] ____cacheline_aligned; int crypto_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len); -- cgit v1.2.3 From 9467a3150cf4afca638673e099af71e8c493a3a0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Jul 2019 21:41:46 +0200 Subject: crypto: lib/aes - export sbox and inverse sbox There are a few copies of the AES S-boxes floating around, so export the ones from the AES library so that we can reuse them in other modules. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/aes.h | 3 +++ lib/crypto/aes.c | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/crypto/aes.h b/include/crypto/aes.h index df8426fd8051..8e0f4cf948e5 100644 --- a/include/crypto/aes.h +++ b/include/crypto/aes.h @@ -67,4 +67,7 @@ void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); */ void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); +extern const u8 crypto_aes_sbox[]; +extern const u8 crypto_aes_inv_sbox[]; + #endif diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c index 9928b23e0a8a..4e100af38c51 100644 --- a/lib/crypto/aes.c +++ b/lib/crypto/aes.c @@ -82,6 +82,12 @@ static volatile const u8 __cacheline_aligned aes_inv_sbox[] = { 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, }; +extern const u8 crypto_aes_sbox[256] __alias(aes_sbox); +extern const u8 crypto_aes_inv_sbox[256] __alias(aes_inv_sbox); + +EXPORT_SYMBOL(crypto_aes_sbox); +EXPORT_SYMBOL(crypto_aes_inv_sbox); + static u32 mul_by_x(u32 w) { u32 x = w & 0x7f7f7f7f; -- cgit v1.2.3 From 5cb97700beaa005ceb2a127b6f53536a4544c9d8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Jul 2019 10:55:06 +0200 Subject: crypto: morus - remove generic and x86 implementations MORUS was not selected as a winner in the CAESAR competition, which is not surprising since it is considered to be cryptographically broken [0]. (Note that this is not an implementation defect, but a flaw in the underlying algorithm). Since it is unlikely to be in use currently, let's remove it before we're stuck with it. [0] https://eprint.iacr.org/2019/172.pdf Reviewed-by: Ondrej Mosnacek Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 13 - arch/x86/crypto/morus1280-avx2-asm.S | 619 ------------ arch/x86/crypto/morus1280-avx2-glue.c | 62 -- arch/x86/crypto/morus1280-sse2-asm.S | 893 ----------------- arch/x86/crypto/morus1280-sse2-glue.c | 61 -- arch/x86/crypto/morus1280_glue.c | 205 ---- arch/x86/crypto/morus640-sse2-asm.S | 612 ------------ arch/x86/crypto/morus640-sse2-glue.c | 61 -- arch/x86/crypto/morus640_glue.c | 200 ---- crypto/Kconfig | 56 -- crypto/Makefile | 2 - crypto/morus1280.c | 542 ----------- crypto/morus640.c | 533 ---------- crypto/testmgr.c | 12 - crypto/testmgr.h | 1707 --------------------------------- include/crypto/morus1280_glue.h | 97 -- include/crypto/morus640_glue.h | 97 -- include/crypto/morus_common.h | 18 - 18 files changed, 5790 deletions(-) delete mode 100644 arch/x86/crypto/morus1280-avx2-asm.S delete mode 100644 arch/x86/crypto/morus1280-avx2-glue.c delete mode 100644 arch/x86/crypto/morus1280-sse2-asm.S delete mode 100644 arch/x86/crypto/morus1280-sse2-glue.c delete mode 100644 arch/x86/crypto/morus1280_glue.c delete mode 100644 arch/x86/crypto/morus640-sse2-asm.S delete mode 100644 arch/x86/crypto/morus640-sse2-glue.c delete mode 100644 arch/x86/crypto/morus640_glue.c delete mode 100644 crypto/morus1280.c delete mode 100644 crypto/morus640.c delete mode 100644 include/crypto/morus1280_glue.h delete mode 100644 include/crypto/morus640_glue.h delete mode 100644 include/crypto/morus_common.h (limited to 'include') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index b96a14e67ab0..6f1d825fbb09 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -39,12 +39,6 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o -obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o -obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o - -obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o -obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o - obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o @@ -62,8 +56,6 @@ endif ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o - - obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o endif twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o @@ -81,9 +73,6 @@ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o -morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o -morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o - nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o ifeq ($(avx_supported),yes) @@ -102,8 +91,6 @@ ifeq ($(avx2_supported),yes) chacha-x86_64-y += chacha-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o - morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o - nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o endif diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S deleted file mode 100644 index 5413fee33481..000000000000 --- a/arch/x86/crypto/morus1280-avx2-asm.S +++ /dev/null @@ -1,619 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AVX2 implementation of MORUS-1280 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) -#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) - -#define STATE0 %ymm0 -#define STATE0_LOW %xmm0 -#define STATE1 %ymm1 -#define STATE2 %ymm2 -#define STATE3 %ymm3 -#define STATE4 %ymm4 -#define KEY %ymm5 -#define MSG %ymm5 -#define MSG_LOW %xmm5 -#define T0 %ymm6 -#define T0_LOW %xmm6 -#define T1 %ymm7 - -.section .rodata.cst32.morus1280_const, "aM", @progbits, 32 -.align 32 -.Lmorus1280_const: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32 -.align 32 -.Lmorus1280_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - -.text - -.macro morus1280_round s0, s1, s2, s3, s4, b, w - vpand \s1, \s2, T0 - vpxor T0, \s0, \s0 - vpxor \s3, \s0, \s0 - vpsllq $\b, \s0, T0 - vpsrlq $(64 - \b), \s0, \s0 - vpxor T0, \s0, \s0 - vpermq $\w, \s3, \s3 -.endm - -/* - * __morus1280_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update: - morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 - vpxor MSG, STATE1, STATE1 - morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 - vpxor MSG, STATE2, STATE2 - morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 - vpxor MSG, STATE3, STATE3 - morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 - vpxor MSG, STATE4, STATE4 - morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 - ret -ENDPROC(__morus1280_update) - -/* - * __morus1280_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update_zero: - morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 - morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 - morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 - morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 - morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 - ret -ENDPROC(__morus1280_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - vpxor MSG, MSG, MSG - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG_LOW - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pshufd $MASK2, MSG_LOW, MSG_LOW - pinsrq $0, (%r8), MSG_LOW - -.Lld_partial_8: - mov %rcx, %r8 - and $0x10, %r8 - jz .Lld_partial_16 - - vpermq $MASK2, MSG, MSG - movdqu (%rsi), MSG_LOW - -.Lld_partial_16: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - cmp $16, %r8 - jl .Lst_partial_16 - - movdqu T0_LOW, (%r9) - vpermq $MASK2, T0, T0 - - sub $16, %r8 - add $16, %r9 - -.Lst_partial_16: - movq T0_LOW, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - pextrq $1, T0_LOW, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus1280_avx2_init(void *state, const void *key, - * const void *iv); - */ -ENTRY(crypto_morus1280_avx2_init) - FRAME_BEGIN - - /* load IV: */ - vpxor STATE0, STATE0, STATE0 - movdqu (%rdx), STATE0_LOW - /* load key: */ - vmovdqu (%rsi), KEY - vmovdqa KEY, STATE1 - /* load all ones: */ - vpcmpeqd STATE2, STATE2, STATE2 - /* load all zeros: */ - vpxor STATE3, STATE3, STATE3 - /* load the constant: */ - vmovdqa .Lmorus1280_const, STATE4 - - /* update 16 times with zero: */ - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - - /* xor-in the key again after updates: */ - vpxor KEY, STATE1, STATE1 - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_init) - -/* - * void crypto_morus1280_avx2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_ad) - FRAME_BEGIN - - cmp $32, %rdx - jb .Lad_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - and $0x1F, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - vmovdqa (%rsi), MSG - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - vmovdqu (%rsi), MSG - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_u_loop - -.Lad_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_ad) - -/* - * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_enc) - FRAME_BEGIN - - cmp $32, %rcx - jb .Lenc_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0x1F, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - vmovdqa (%rsi), MSG - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - vmovdqa T0, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - vmovdqu (%rsi), MSG - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - vmovdqu T0, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_enc) - -/* - * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* encrypt message: */ - call __load_partial - - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - - call __store_partial - - call __morus1280_update - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_enc_tail) - -/* - * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_dec) - FRAME_BEGIN - - cmp $32, %rcx - jb .Ldec_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0x1F, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - vmovdqa (%rsi), MSG - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqa MSG, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - vmovdqu (%rsi), MSG - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqu MSG, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_dec) - -/* - * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* decrypt message: */ - call __load_partial - - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqa MSG, T0 - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0_LOW - vpbroadcastb T0_LOW, T0 - vmovdqa .Lmorus1280_counter, T1 - vpcmpgtb T1, T0, T0 - vpand T0, MSG, MSG - - call __morus1280_update - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_dec_tail) - -/* - * void crypto_morus1280_avx2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus1280_avx2_final) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* xor state[0] into state[4]: */ - vpxor STATE0, STATE4, STATE4 - - /* prepare length block: */ - vpxor MSG, MSG, MSG - vpinsrq $0, %rdx, MSG_LOW, MSG_LOW - vpinsrq $1, %rcx, MSG_LOW, MSG_LOW - vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */ - - /* update state: */ - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - - /* xor tag: */ - vmovdqu (%rsi), MSG - - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqu MSG, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_final) diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c deleted file mode 100644 index 2d000d66ba4c..000000000000 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Glue for AVX2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS1280_DECLARE_ALG(avx2, "morus1280-avx2", 400); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus1280_avx2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_AVX2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus1280_avx2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus1280_avx2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus1280_avx2_alg, 1, &simd_alg); -} - -module_init(crypto_morus1280_avx2_module_init); -module_exit(crypto_morus1280_avx2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation"); -MODULE_ALIAS_CRYPTO("morus1280"); -MODULE_ALIAS_CRYPTO("morus1280-avx2"); diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S deleted file mode 100644 index 0eece772866b..000000000000 --- a/arch/x86/crypto/morus1280-sse2-asm.S +++ /dev/null @@ -1,893 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * SSE2 implementation of MORUS-1280 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) - -#define STATE0_LO %xmm0 -#define STATE0_HI %xmm1 -#define STATE1_LO %xmm2 -#define STATE1_HI %xmm3 -#define STATE2_LO %xmm4 -#define STATE2_HI %xmm5 -#define STATE3_LO %xmm6 -#define STATE3_HI %xmm7 -#define STATE4_LO %xmm8 -#define STATE4_HI %xmm9 -#define KEY_LO %xmm10 -#define KEY_HI %xmm11 -#define MSG_LO %xmm10 -#define MSG_HI %xmm11 -#define T0_LO %xmm12 -#define T0_HI %xmm13 -#define T1_LO %xmm14 -#define T1_HI %xmm15 - -.section .rodata.cst16.morus640_const, "aM", @progbits, 16 -.align 16 -.Lmorus640_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Lmorus640_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 -.align 16 -.Lmorus640_counter_0: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f -.Lmorus640_counter_1: - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - -.text - -.macro rol1 hi, lo - /* - * HI_1 | HI_0 || LO_1 | LO_0 - * ==> - * HI_0 | HI_1 || LO_1 | LO_0 - * ==> - * HI_0 | LO_1 || LO_0 | HI_1 - */ - pshufd $MASK2, \hi, \hi - movdqa \hi, T0_LO - punpcklqdq \lo, T0_LO - punpckhqdq \hi, \lo - movdqa \lo, \hi - movdqa T0_LO, \lo -.endm - -.macro rol2 hi, lo - movdqa \lo, T0_LO - movdqa \hi, \lo - movdqa T0_LO, \hi -.endm - -.macro rol3 hi, lo - /* - * HI_1 | HI_0 || LO_1 | LO_0 - * ==> - * HI_0 | HI_1 || LO_1 | LO_0 - * ==> - * LO_0 | HI_1 || HI_0 | LO_1 - */ - pshufd $MASK2, \hi, \hi - movdqa \lo, T0_LO - punpckhqdq \hi, T0_LO - punpcklqdq \lo, \hi - movdqa T0_LO, \lo -.endm - -.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w - movdqa \s1_l, T0_LO - pand \s2_l, T0_LO - pxor T0_LO, \s0_l - - movdqa \s1_h, T0_LO - pand \s2_h, T0_LO - pxor T0_LO, \s0_h - - pxor \s3_l, \s0_l - pxor \s3_h, \s0_h - - movdqa \s0_l, T0_LO - psllq $\b, T0_LO - psrlq $(64 - \b), \s0_l - pxor T0_LO, \s0_l - - movdqa \s0_h, T0_LO - psllq $\b, T0_LO - psrlq $(64 - \b), \s0_h - pxor T0_LO, \s0_h - - \w \s3_h, \s3_l -.endm - -/* - * __morus1280_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update: - morus1280_round \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - 13, rol1 - pxor MSG_LO, STATE1_LO - pxor MSG_HI, STATE1_HI - morus1280_round \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - 46, rol2 - pxor MSG_LO, STATE2_LO - pxor MSG_HI, STATE2_HI - morus1280_round \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - 38, rol3 - pxor MSG_LO, STATE3_LO - pxor MSG_HI, STATE3_HI - morus1280_round \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - 7, rol2 - pxor MSG_LO, STATE4_LO - pxor MSG_HI, STATE4_HI - morus1280_round \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - 4, rol1 - ret -ENDPROC(__morus1280_update) - -/* - * __morus1280_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update_zero: - morus1280_round \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - 13, rol1 - morus1280_round \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - 46, rol2 - morus1280_round \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - 38, rol3 - morus1280_round \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - 7, rol2 - morus1280_round \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - 4, rol1 - ret -ENDPROC(__morus1280_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG_LO, MSG_LO - pxor MSG_HI, MSG_HI - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG_LO - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pslldq $8, MSG_LO - movq (%r8), T0_LO - pxor T0_LO, MSG_LO - -.Lld_partial_8: - mov %rcx, %r8 - and $0x10, %r8 - jz .Lld_partial_16 - - movdqa MSG_LO, MSG_HI - movdqu (%rsi), MSG_LO - -.Lld_partial_16: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - cmp $16, %r8 - jl .Lst_partial_16 - - movdqu T0_LO, (%r9) - movdqa T0_HI, T0_LO - - sub $16, %r8 - add $16, %r9 - -.Lst_partial_16: - movq T0_LO, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0_LO - movq T0_LO, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus1280_sse2_init(void *state, const void *key, - * const void *iv); - */ -ENTRY(crypto_morus1280_sse2_init) - FRAME_BEGIN - - /* load IV: */ - pxor STATE0_HI, STATE0_HI - movdqu (%rdx), STATE0_LO - /* load key: */ - movdqu 0(%rsi), KEY_LO - movdqu 16(%rsi), KEY_HI - movdqa KEY_LO, STATE1_LO - movdqa KEY_HI, STATE1_HI - /* load all ones: */ - pcmpeqd STATE2_LO, STATE2_LO - pcmpeqd STATE2_HI, STATE2_HI - /* load all zeros: */ - pxor STATE3_LO, STATE3_LO - pxor STATE3_HI, STATE3_HI - /* load the constant: */ - movdqa .Lmorus640_const_0, STATE4_LO - movdqa .Lmorus640_const_1, STATE4_HI - - /* update 16 times with zero: */ - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - - /* xor-in the key again after updates: */ - pxor KEY_LO, STATE1_LO - pxor KEY_HI, STATE1_HI - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_init) - -/* - * void crypto_morus1280_sse2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_ad) - FRAME_BEGIN - - cmp $32, %rdx - jb .Lad_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_u_loop - -.Lad_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_ad) - -/* - * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_enc) - FRAME_BEGIN - - cmp $32, %rcx - jb .Lenc_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - movdqa T0_LO, 0(%rdx) - movdqa T0_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - movdqu T0_LO, 0(%rdx) - movdqu T0_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_enc) - -/* - * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* encrypt message: */ - call __load_partial - - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - - call __store_partial - - call __morus1280_update - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_enc_tail) - -/* - * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_dec) - FRAME_BEGIN - - cmp $32, %rcx - jb .Ldec_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa MSG_LO, 0(%rdx) - movdqa MSG_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqu MSG_LO, 0(%rdx) - movdqu MSG_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_dec) - -/* - * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* decrypt message: */ - call __load_partial - - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - movdqa T0_LO, T0_HI - movdqa .Lmorus640_counter_0, T1_LO - movdqa .Lmorus640_counter_1, T1_HI - pcmpgtb T1_LO, T0_LO - pcmpgtb T1_HI, T0_HI - pand T0_LO, MSG_LO - pand T0_HI, MSG_HI - - call __morus1280_update - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_dec_tail) - -/* - * void crypto_morus1280_sse2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus1280_sse2_final) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* xor state[0] into state[4]: */ - pxor STATE0_LO, STATE4_LO - pxor STATE0_HI, STATE4_HI - - /* prepare length block: */ - movq %rdx, MSG_LO - movq %rcx, T0_LO - pslldq $8, T0_LO - pxor T0_LO, MSG_LO - psllq $3, MSG_LO /* multiply by 8 (to get bit count) */ - pxor MSG_HI, MSG_HI - - /* update state: */ - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - - /* xor tag: */ - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T0_LO - movdqa STATE1_HI, T0_HI - rol3 T0_HI, T0_LO - pxor T0_LO, MSG_LO - pxor T0_HI, MSG_HI - movdqa STATE2_LO, T0_LO - movdqa STATE2_HI, T0_HI - pand STATE3_LO, T0_LO - pand STATE3_HI, T0_HI - pxor T0_LO, MSG_LO - pxor T0_HI, MSG_HI - - movdqu MSG_LO, 0(%rsi) - movdqu MSG_HI, 16(%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_final) diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c deleted file mode 100644 index aada9d774293..000000000000 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Glue for SSE2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS1280_DECLARE_ALG(sse2, "morus1280-sse2", 350); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus1280_sse2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus1280_sse2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus1280_sse2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus1280_sse2_alg, 1, &simd_alg); -} - -module_init(crypto_morus1280_sse2_module_init); -module_exit(crypto_morus1280_sse2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation"); -MODULE_ALIAS_CRYPTO("morus1280"); -MODULE_ALIAS_CRYPTO("morus1280-sse2"); diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c deleted file mode 100644 index ffbde8b22838..000000000000 --- a/arch/x86/crypto/morus1280_glue.c +++ /dev/null @@ -1,205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Common x86 SIMD glue skeleton - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct morus1280_state { - struct morus1280_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus1280_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, const void *src, void *dst, - unsigned int length); - void (*crypt_tail)(void *state, const void *src, void *dst, - unsigned int length); -}; - -static void crypto_morus1280_glue_process_ad( - struct morus1280_state *state, - const struct morus1280_glue_ops *ops, - struct scatterlist *sg_src, unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus1280_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS1280_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS1280_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); - pos = 0; - left -= fill; - src += fill; - } - - ops->ad(state, src, left); - src += left & ~(MORUS1280_BLOCK_SIZE - 1); - left &= MORUS1280_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos); - ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); - } -} - -static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state, - struct morus1280_ops ops, - struct skcipher_walk *walk) -{ - while (walk->nbytes >= MORUS1280_BLOCK_SIZE) { - ops.crypt_blocks(state, walk->src.virt.addr, - walk->dst.virt.addr, - round_down(walk->nbytes, - MORUS1280_BLOCK_SIZE)); - skcipher_walk_done(walk, walk->nbytes % MORUS1280_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr, - walk->nbytes); - skcipher_walk_done(walk, 0); - } -} - -int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen == MORUS1280_BLOCK_SIZE) { - memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE); - } else if (keylen == MORUS1280_BLOCK_SIZE / 2) { - memcpy(ctx->key.bytes, key, keylen); - memcpy(ctx->key.bytes + keylen, key, keylen); - } else { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey); - -int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize); - -static void crypto_morus1280_glue_crypt(struct aead_request *req, - struct morus1280_ops ops, - unsigned int cryptlen, - struct morus1280_block *tag_xor) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_state state; - struct skcipher_walk walk; - - ops.skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - ctx->ops->init(&state, &ctx->key, req->iv); - crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); - crypto_morus1280_glue_process_crypt(&state, ops, &walk); - ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -int crypto_morus1280_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = ctx->ops->enc, - .crypt_tail = ctx->ops->enc_tail, - }; - - struct morus1280_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt); - -int crypto_morus1280_glue_decrypt(struct aead_request *req) -{ - static const u8 zeros[MORUS1280_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = ctx->ops->dec, - .crypt_tail = ctx->ops->dec_tail, - }; - - struct morus1280_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); - - return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt); - -void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, - const struct morus1280_glue_ops *ops) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - ctx->ops = ops; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S deleted file mode 100644 index a60891101bbd..000000000000 --- a/arch/x86/crypto/morus640-sse2-asm.S +++ /dev/null @@ -1,612 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * SSE2 implementation of MORUS-640 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) -#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) - -#define STATE0 %xmm0 -#define STATE1 %xmm1 -#define STATE2 %xmm2 -#define STATE3 %xmm3 -#define STATE4 %xmm4 -#define KEY %xmm5 -#define MSG %xmm5 -#define T0 %xmm6 -#define T1 %xmm7 - -.section .rodata.cst16.morus640_const, "aM", @progbits, 32 -.align 16 -.Lmorus640_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Lmorus640_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 -.align 16 -.Lmorus640_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - -.text - -.macro morus640_round s0, s1, s2, s3, s4, b, w - movdqa \s1, T0 - pand \s2, T0 - pxor T0, \s0 - pxor \s3, \s0 - movdqa \s0, T0 - pslld $\b, T0 - psrld $(32 - \b), \s0 - pxor T0, \s0 - pshufd $\w, \s3, \s3 -.endm - -/* - * __morus640_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus640_update: - morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 - pxor MSG, STATE1 - morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 - pxor MSG, STATE2 - morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 - pxor MSG, STATE3 - morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 - pxor MSG, STATE4 - morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 - ret -ENDPROC(__morus640_update) - - -/* - * __morus640_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus640_update_zero: - morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 - morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 - morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 - morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 - morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 - ret -ENDPROC(__morus640_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG, MSG - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv); - */ -ENTRY(crypto_morus640_sse2_init) - FRAME_BEGIN - - /* load IV: */ - movdqu (%rdx), STATE0 - /* load key: */ - movdqu (%rsi), KEY - movdqa KEY, STATE1 - /* load all ones: */ - pcmpeqd STATE2, STATE2 - /* load the constants: */ - movdqa .Lmorus640_const_0, STATE3 - movdqa .Lmorus640_const_1, STATE4 - - /* update 16 times with zero: */ - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - - /* xor-in the key again after updates: */ - pxor KEY, STATE1 - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_init) - -/* - * void crypto_morus640_sse2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_ad) - FRAME_BEGIN - - cmp $16, %rdx - jb .Lad_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - movdqa (%rsi), MSG - call __morus640_update - sub $16, %rdx - add $16, %rsi - cmp $16, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - movdqu (%rsi), MSG - call __morus640_update - sub $16, %rdx - add $16, %rsi - cmp $16, %rdx - jge .Lad_u_loop - -.Lad_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_ad) - -/* - * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_enc) - FRAME_BEGIN - - cmp $16, %rcx - jb .Lenc_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - movdqa (%rsi), MSG - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - movdqa T0, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - movdqu (%rsi), MSG - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - movdqu T0, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_enc) - -/* - * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* encrypt message: */ - call __load_partial - - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - - call __store_partial - - call __morus640_update - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_enc_tail) - -/* - * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_dec) - FRAME_BEGIN - - cmp $16, %rcx - jb .Ldec_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - movdqa (%rsi), MSG - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqa MSG, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - movdqu (%rsi), MSG - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqu MSG, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_dec) - -/* - * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* decrypt message: */ - call __load_partial - - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqa MSG, T0 - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Lmorus640_counter, T1 - pcmpgtb T1, T0 - pand T0, MSG - - call __morus640_update - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_dec_tail) - -/* - * void crypto_morus640_sse2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus640_sse2_final) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* xor state[0] into state[4]: */ - pxor STATE0, STATE4 - - /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG - psllq $3, MSG /* multiply by 8 (to get bit count) */ - - /* update state: */ - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - - /* xor tag: */ - movdqu (%rsi), MSG - - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - - movdqu MSG, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_final) diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c deleted file mode 100644 index 8ef68134aef4..000000000000 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * Glue for SSE2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -asmlinkage void crypto_morus640_sse2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS640_DECLARE_ALG(sse2, "morus640-sse2", 400); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus640_sse2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus640_sse2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus640_sse2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus640_sse2_alg, 1, &simd_alg); -} - -module_init(crypto_morus640_sse2_module_init); -module_exit(crypto_morus640_sse2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation"); -MODULE_ALIAS_CRYPTO("morus640"); -MODULE_ALIAS_CRYPTO("morus640-sse2"); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c deleted file mode 100644 index d8b5fd6cef29..000000000000 --- a/arch/x86/crypto/morus640_glue.c +++ /dev/null @@ -1,200 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * Common x86 SIMD glue skeleton - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct morus640_state { - struct morus640_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus640_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, const void *src, void *dst, - unsigned int length); - void (*crypt_tail)(void *state, const void *src, void *dst, - unsigned int length); -}; - -static void crypto_morus640_glue_process_ad( - struct morus640_state *state, - const struct morus640_glue_ops *ops, - struct scatterlist *sg_src, unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus640_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS640_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS640_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); - pos = 0; - left -= fill; - src += fill; - } - - ops->ad(state, src, left); - src += left & ~(MORUS640_BLOCK_SIZE - 1); - left &= MORUS640_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos); - ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); - } -} - -static void crypto_morus640_glue_process_crypt(struct morus640_state *state, - struct morus640_ops ops, - struct skcipher_walk *walk) -{ - while (walk->nbytes >= MORUS640_BLOCK_SIZE) { - ops.crypt_blocks(state, walk->src.virt.addr, - walk->dst.virt.addr, - round_down(walk->nbytes, MORUS640_BLOCK_SIZE)); - skcipher_walk_done(walk, walk->nbytes % MORUS640_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr, - walk->nbytes); - skcipher_walk_done(walk, 0); - } -} - -int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen != MORUS640_BLOCK_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey); - -int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize); - -static void crypto_morus640_glue_crypt(struct aead_request *req, - struct morus640_ops ops, - unsigned int cryptlen, - struct morus640_block *tag_xor) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_state state; - struct skcipher_walk walk; - - ops.skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - ctx->ops->init(&state, &ctx->key, req->iv); - crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); - crypto_morus640_glue_process_crypt(&state, ops, &walk); - ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -int crypto_morus640_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = ctx->ops->enc, - .crypt_tail = ctx->ops->enc_tail, - }; - - struct morus640_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt); - -int crypto_morus640_glue_decrypt(struct aead_request *req) -{ - static const u8 zeros[MORUS640_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = ctx->ops->dec, - .crypt_tail = ctx->ops->dec_tail, - }; - - struct morus640_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); - - return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt); - -void crypto_morus640_glue_init_ops(struct crypto_aead *aead, - const struct morus640_glue_ops *ops) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - ctx->ops = ops; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 80ea118600ab..160dc1a9c7ec 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -344,62 +344,6 @@ config CRYPTO_AEGIS256_AESNI_SSE2 help AESNI+SSE2 implementation of the AEGIS-256 dedicated AEAD algorithm. -config CRYPTO_MORUS640 - tristate "MORUS-640 AEAD algorithm" - select CRYPTO_AEAD - help - Support for the MORUS-640 dedicated AEAD algorithm. - -config CRYPTO_MORUS640_GLUE - tristate - depends on X86 - select CRYPTO_AEAD - select CRYPTO_SIMD - help - Common glue for SIMD optimizations of the MORUS-640 dedicated AEAD - algorithm. - -config CRYPTO_MORUS640_SSE2 - tristate "MORUS-640 AEAD algorithm (x86_64 SSE2 implementation)" - depends on X86 && 64BIT - select CRYPTO_AEAD - select CRYPTO_MORUS640_GLUE - help - SSE2 implementation of the MORUS-640 dedicated AEAD algorithm. - -config CRYPTO_MORUS1280 - tristate "MORUS-1280 AEAD algorithm" - select CRYPTO_AEAD - help - Support for the MORUS-1280 dedicated AEAD algorithm. - -config CRYPTO_MORUS1280_GLUE - tristate - depends on X86 - select CRYPTO_AEAD - select CRYPTO_SIMD - help - Common glue for SIMD optimizations of the MORUS-1280 dedicated AEAD - algorithm. - -config CRYPTO_MORUS1280_SSE2 - tristate "MORUS-1280 AEAD algorithm (x86_64 SSE2 implementation)" - depends on X86 && 64BIT - select CRYPTO_AEAD - select CRYPTO_MORUS1280_GLUE - help - SSE2 optimizedimplementation of the MORUS-1280 dedicated AEAD - algorithm. - -config CRYPTO_MORUS1280_AVX2 - tristate "MORUS-1280 AEAD algorithm (x86_64 AVX2 implementation)" - depends on X86 && 64BIT - select CRYPTO_AEAD - select CRYPTO_MORUS1280_GLUE - help - AVX2 optimized implementation of the MORUS-1280 dedicated AEAD - algorithm. - config CRYPTO_SEQIV tristate "Sequence Number IV Generator" select CRYPTO_AEAD diff --git a/crypto/Makefile b/crypto/Makefile index 9479e1a45d8c..4bc1951d3787 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -92,8 +92,6 @@ obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o obj-$(CONFIG_CRYPTO_AEGIS128) += aegis128.o obj-$(CONFIG_CRYPTO_AEGIS128L) += aegis128l.o obj-$(CONFIG_CRYPTO_AEGIS256) += aegis256.o -obj-$(CONFIG_CRYPTO_MORUS640) += morus640.o -obj-$(CONFIG_CRYPTO_MORUS1280) += morus1280.o obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o obj-$(CONFIG_CRYPTO_DES) += des_generic.o diff --git a/crypto/morus1280.c b/crypto/morus1280.c deleted file mode 100644 index f8734c6576af..000000000000 --- a/crypto/morus1280.c +++ /dev/null @@ -1,542 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MORUS1280_WORD_SIZE 8 -#define MORUS1280_BLOCK_SIZE (MORUS_BLOCK_WORDS * MORUS1280_WORD_SIZE) -#define MORUS1280_BLOCK_ALIGN (__alignof__(__le64)) -#define MORUS1280_ALIGNED(p) IS_ALIGNED((uintptr_t)p, MORUS1280_BLOCK_ALIGN) - -struct morus1280_block { - u64 words[MORUS_BLOCK_WORDS]; -}; - -union morus1280_block_in { - __le64 words[MORUS_BLOCK_WORDS]; - u8 bytes[MORUS1280_BLOCK_SIZE]; -}; - -struct morus1280_state { - struct morus1280_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus1280_ctx { - struct morus1280_block key; -}; - -struct morus1280_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_chunk)(struct morus1280_state *state, - u8 *dst, const u8 *src, unsigned int size); -}; - -static const struct morus1280_block crypto_morus1280_const[1] = { - { .words = { - U64_C(0x0d08050302010100), - U64_C(0x6279e99059372215), - U64_C(0xf12fc26d55183ddb), - U64_C(0xdd28b57342311120), - } }, -}; - -static void crypto_morus1280_round(struct morus1280_block *b0, - struct morus1280_block *b1, - struct morus1280_block *b2, - struct morus1280_block *b3, - struct morus1280_block *b4, - const struct morus1280_block *m, - unsigned int b, unsigned int w) -{ - unsigned int i; - struct morus1280_block tmp; - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - b0->words[i] ^= b1->words[i] & b2->words[i]; - b0->words[i] ^= b3->words[i]; - b0->words[i] ^= m->words[i]; - b0->words[i] = rol64(b0->words[i], b); - } - - tmp = *b3; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - b3->words[(i + w) % MORUS_BLOCK_WORDS] = tmp.words[i]; -} - -static void crypto_morus1280_update(struct morus1280_state *state, - const struct morus1280_block *m) -{ - static const struct morus1280_block z = {}; - - struct morus1280_block *s = state->s; - - crypto_morus1280_round(&s[0], &s[1], &s[2], &s[3], &s[4], &z, 13, 1); - crypto_morus1280_round(&s[1], &s[2], &s[3], &s[4], &s[0], m, 46, 2); - crypto_morus1280_round(&s[2], &s[3], &s[4], &s[0], &s[1], m, 38, 3); - crypto_morus1280_round(&s[3], &s[4], &s[0], &s[1], &s[2], m, 7, 2); - crypto_morus1280_round(&s[4], &s[0], &s[1], &s[2], &s[3], m, 4, 1); -} - -static void crypto_morus1280_load_a(struct morus1280_block *dst, const u8 *src) -{ - unsigned int i; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - dst->words[i] = le64_to_cpu(*(const __le64 *)src); - src += MORUS1280_WORD_SIZE; - } -} - -static void crypto_morus1280_load_u(struct morus1280_block *dst, const u8 *src) -{ - unsigned int i; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - dst->words[i] = get_unaligned_le64(src); - src += MORUS1280_WORD_SIZE; - } -} - -static void crypto_morus1280_load(struct morus1280_block *dst, const u8 *src) -{ - if (MORUS1280_ALIGNED(src)) - crypto_morus1280_load_a(dst, src); - else - crypto_morus1280_load_u(dst, src); -} - -static void crypto_morus1280_store_a(u8 *dst, const struct morus1280_block *src) -{ - unsigned int i; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - *(__le64 *)dst = cpu_to_le64(src->words[i]); - dst += MORUS1280_WORD_SIZE; - } -} - -static void crypto_morus1280_store_u(u8 *dst, const struct morus1280_block *src) -{ - unsigned int i; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - put_unaligned_le64(src->words[i], dst); - dst += MORUS1280_WORD_SIZE; - } -} - -static void crypto_morus1280_store(u8 *dst, const struct morus1280_block *src) -{ - if (MORUS1280_ALIGNED(dst)) - crypto_morus1280_store_a(dst, src); - else - crypto_morus1280_store_u(dst, src); -} - -static void crypto_morus1280_ad(struct morus1280_state *state, const u8 *src, - unsigned int size) -{ - struct morus1280_block m; - - if (MORUS1280_ALIGNED(src)) { - while (size >= MORUS1280_BLOCK_SIZE) { - crypto_morus1280_load_a(&m, src); - crypto_morus1280_update(state, &m); - - size -= MORUS1280_BLOCK_SIZE; - src += MORUS1280_BLOCK_SIZE; - } - } else { - while (size >= MORUS1280_BLOCK_SIZE) { - crypto_morus1280_load_u(&m, src); - crypto_morus1280_update(state, &m); - - size -= MORUS1280_BLOCK_SIZE; - src += MORUS1280_BLOCK_SIZE; - } - } -} - -static void crypto_morus1280_core(const struct morus1280_state *state, - struct morus1280_block *blk) -{ - unsigned int i; - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - blk->words[(i + 3) % MORUS_BLOCK_WORDS] ^= state->s[1].words[i]; - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - blk->words[i] ^= state->s[0].words[i]; - blk->words[i] ^= state->s[2].words[i] & state->s[3].words[i]; - } -} - -static void crypto_morus1280_encrypt_chunk(struct morus1280_state *state, - u8 *dst, const u8 *src, - unsigned int size) -{ - struct morus1280_block c, m; - - if (MORUS1280_ALIGNED(src) && MORUS1280_ALIGNED(dst)) { - while (size >= MORUS1280_BLOCK_SIZE) { - crypto_morus1280_load_a(&m, src); - c = m; - crypto_morus1280_core(state, &c); - crypto_morus1280_store_a(dst, &c); - crypto_morus1280_update(state, &m); - - src += MORUS1280_BLOCK_SIZE; - dst += MORUS1280_BLOCK_SIZE; - size -= MORUS1280_BLOCK_SIZE; - } - } else { - while (size >= MORUS1280_BLOCK_SIZE) { - crypto_morus1280_load_u(&m, src); - c = m; - crypto_morus1280_core(state, &c); - crypto_morus1280_store_u(dst, &c); - crypto_morus1280_update(state, &m); - - src += MORUS1280_BLOCK_SIZE; - dst += MORUS1280_BLOCK_SIZE; - size -= MORUS1280_BLOCK_SIZE; - } - } - - if (size > 0) { - union morus1280_block_in tail; - - memcpy(tail.bytes, src, size); - memset(tail.bytes + size, 0, MORUS1280_BLOCK_SIZE - size); - - crypto_morus1280_load_a(&m, tail.bytes); - c = m; - crypto_morus1280_core(state, &c); - crypto_morus1280_store_a(tail.bytes, &c); - crypto_morus1280_update(state, &m); - - memcpy(dst, tail.bytes, size); - } -} - -static void crypto_morus1280_decrypt_chunk(struct morus1280_state *state, - u8 *dst, const u8 *src, - unsigned int size) -{ - struct morus1280_block m; - - if (MORUS1280_ALIGNED(src) && MORUS1280_ALIGNED(dst)) { - while (size >= MORUS1280_BLOCK_SIZE) { - crypto_morus1280_load_a(&m, src); - crypto_morus1280_core(state, &m); - crypto_morus1280_store_a(dst, &m); - crypto_morus1280_update(state, &m); - - src += MORUS1280_BLOCK_SIZE; - dst += MORUS1280_BLOCK_SIZE; - size -= MORUS1280_BLOCK_SIZE; - } - } else { - while (size >= MORUS1280_BLOCK_SIZE) { - crypto_morus1280_load_u(&m, src); - crypto_morus1280_core(state, &m); - crypto_morus1280_store_u(dst, &m); - crypto_morus1280_update(state, &m); - - src += MORUS1280_BLOCK_SIZE; - dst += MORUS1280_BLOCK_SIZE; - size -= MORUS1280_BLOCK_SIZE; - } - } - - if (size > 0) { - union morus1280_block_in tail; - - memcpy(tail.bytes, src, size); - memset(tail.bytes + size, 0, MORUS1280_BLOCK_SIZE - size); - - crypto_morus1280_load_a(&m, tail.bytes); - crypto_morus1280_core(state, &m); - crypto_morus1280_store_a(tail.bytes, &m); - memset(tail.bytes + size, 0, MORUS1280_BLOCK_SIZE - size); - crypto_morus1280_load_a(&m, tail.bytes); - crypto_morus1280_update(state, &m); - - memcpy(dst, tail.bytes, size); - } -} - -static void crypto_morus1280_init(struct morus1280_state *state, - const struct morus1280_block *key, - const u8 *iv) -{ - static const struct morus1280_block z = {}; - - union morus1280_block_in tmp; - unsigned int i; - - memcpy(tmp.bytes, iv, MORUS_NONCE_SIZE); - memset(tmp.bytes + MORUS_NONCE_SIZE, 0, - MORUS1280_BLOCK_SIZE - MORUS_NONCE_SIZE); - - crypto_morus1280_load(&state->s[0], tmp.bytes); - state->s[1] = *key; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - state->s[2].words[i] = U64_C(0xFFFFFFFFFFFFFFFF); - state->s[3] = z; - state->s[4] = crypto_morus1280_const[0]; - - for (i = 0; i < 16; i++) - crypto_morus1280_update(state, &z); - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - state->s[1].words[i] ^= key->words[i]; -} - -static void crypto_morus1280_process_ad(struct morus1280_state *state, - struct scatterlist *sg_src, - unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus1280_block m; - union morus1280_block_in buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS1280_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS1280_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - - crypto_morus1280_load_a(&m, buf.bytes); - crypto_morus1280_update(state, &m); - - pos = 0; - left -= fill; - src += fill; - } - - crypto_morus1280_ad(state, src, left); - src += left & ~(MORUS1280_BLOCK_SIZE - 1); - left &= MORUS1280_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos); - - crypto_morus1280_load_a(&m, buf.bytes); - crypto_morus1280_update(state, &m); - } -} - -static void crypto_morus1280_process_crypt(struct morus1280_state *state, - struct aead_request *req, - const struct morus1280_ops *ops) -{ - struct skcipher_walk walk; - - ops->skcipher_walk_init(&walk, req, false); - - while (walk.nbytes) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - ops->crypt_chunk(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - - skcipher_walk_done(&walk, walk.nbytes - nbytes); - } -} - -static void crypto_morus1280_final(struct morus1280_state *state, - struct morus1280_block *tag_xor, - u64 assoclen, u64 cryptlen) -{ - struct morus1280_block tmp; - unsigned int i; - - tmp.words[0] = assoclen * 8; - tmp.words[1] = cryptlen * 8; - tmp.words[2] = 0; - tmp.words[3] = 0; - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - state->s[4].words[i] ^= state->s[0].words[i]; - - for (i = 0; i < 10; i++) - crypto_morus1280_update(state, &tmp); - - crypto_morus1280_core(state, tag_xor); -} - -static int crypto_morus1280_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - union morus1280_block_in tmp; - - if (keylen == MORUS1280_BLOCK_SIZE) - crypto_morus1280_load(&ctx->key, key); - else if (keylen == MORUS1280_BLOCK_SIZE / 2) { - memcpy(tmp.bytes, key, keylen); - memcpy(tmp.bytes + keylen, key, keylen); - - crypto_morus1280_load(&ctx->key, tmp.bytes); - } else { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - return 0; -} - -static int crypto_morus1280_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; -} - -static void crypto_morus1280_crypt(struct aead_request *req, - struct morus1280_block *tag_xor, - unsigned int cryptlen, - const struct morus1280_ops *ops) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_state state; - - crypto_morus1280_init(&state, &ctx->key, req->iv); - crypto_morus1280_process_ad(&state, req->src, req->assoclen); - crypto_morus1280_process_crypt(&state, req, ops); - crypto_morus1280_final(&state, tag_xor, req->assoclen, cryptlen); -} - -static int crypto_morus1280_encrypt(struct aead_request *req) -{ - static const struct morus1280_ops ops = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_chunk = crypto_morus1280_encrypt_chunk, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_block tag = {}; - union morus1280_block_in tag_out; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus1280_crypt(req, &tag, cryptlen, &ops); - crypto_morus1280_store(tag_out.bytes, &tag); - - scatterwalk_map_and_copy(tag_out.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} - -static int crypto_morus1280_decrypt(struct aead_request *req) -{ - static const struct morus1280_ops ops = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_chunk = crypto_morus1280_decrypt_chunk, - }; - static const u8 zeros[MORUS1280_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - union morus1280_block_in tag_in; - struct morus1280_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag_in.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus1280_load(&tag, tag_in.bytes); - crypto_morus1280_crypt(req, &tag, cryptlen, &ops); - crypto_morus1280_store(tag_in.bytes, &tag); - - return crypto_memneq(tag_in.bytes, zeros, authsize) ? -EBADMSG : 0; -} - -static int crypto_morus1280_init_tfm(struct crypto_aead *tfm) -{ - return 0; -} - -static void crypto_morus1280_exit_tfm(struct crypto_aead *tfm) -{ -} - -static struct aead_alg crypto_morus1280_alg = { - .setkey = crypto_morus1280_setkey, - .setauthsize = crypto_morus1280_setauthsize, - .encrypt = crypto_morus1280_encrypt, - .decrypt = crypto_morus1280_decrypt, - .init = crypto_morus1280_init_tfm, - .exit = crypto_morus1280_exit_tfm, - - .ivsize = MORUS_NONCE_SIZE, - .maxauthsize = MORUS_MAX_AUTH_SIZE, - .chunksize = MORUS1280_BLOCK_SIZE, - - .base = { - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct morus1280_ctx), - .cra_alignmask = 0, - - .cra_priority = 100, - - .cra_name = "morus1280", - .cra_driver_name = "morus1280-generic", - - .cra_module = THIS_MODULE, - } -}; - - -static int __init crypto_morus1280_module_init(void) -{ - return crypto_register_aead(&crypto_morus1280_alg); -} - -static void __exit crypto_morus1280_module_exit(void) -{ - crypto_unregister_aead(&crypto_morus1280_alg); -} - -subsys_initcall(crypto_morus1280_module_init); -module_exit(crypto_morus1280_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm"); -MODULE_ALIAS_CRYPTO("morus1280"); -MODULE_ALIAS_CRYPTO("morus1280-generic"); diff --git a/crypto/morus640.c b/crypto/morus640.c deleted file mode 100644 index ae5aa9482cb4..000000000000 --- a/crypto/morus640.c +++ /dev/null @@ -1,533 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MORUS640_WORD_SIZE 4 -#define MORUS640_BLOCK_SIZE (MORUS_BLOCK_WORDS * MORUS640_WORD_SIZE) -#define MORUS640_BLOCK_ALIGN (__alignof__(__le32)) -#define MORUS640_ALIGNED(p) IS_ALIGNED((uintptr_t)p, MORUS640_BLOCK_ALIGN) - -struct morus640_block { - u32 words[MORUS_BLOCK_WORDS]; -}; - -union morus640_block_in { - __le32 words[MORUS_BLOCK_WORDS]; - u8 bytes[MORUS640_BLOCK_SIZE]; -}; - -struct morus640_state { - struct morus640_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus640_ctx { - struct morus640_block key; -}; - -struct morus640_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_chunk)(struct morus640_state *state, - u8 *dst, const u8 *src, unsigned int size); -}; - -static const struct morus640_block crypto_morus640_const[2] = { - { .words = { - U32_C(0x02010100), - U32_C(0x0d080503), - U32_C(0x59372215), - U32_C(0x6279e990), - } }, - { .words = { - U32_C(0x55183ddb), - U32_C(0xf12fc26d), - U32_C(0x42311120), - U32_C(0xdd28b573), - } }, -}; - -static void crypto_morus640_round(struct morus640_block *b0, - struct morus640_block *b1, - struct morus640_block *b2, - struct morus640_block *b3, - struct morus640_block *b4, - const struct morus640_block *m, - unsigned int b, unsigned int w) -{ - unsigned int i; - struct morus640_block tmp; - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - b0->words[i] ^= b1->words[i] & b2->words[i]; - b0->words[i] ^= b3->words[i]; - b0->words[i] ^= m->words[i]; - b0->words[i] = rol32(b0->words[i], b); - } - - tmp = *b3; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - b3->words[(i + w) % MORUS_BLOCK_WORDS] = tmp.words[i]; -} - -static void crypto_morus640_update(struct morus640_state *state, - const struct morus640_block *m) -{ - static const struct morus640_block z = {}; - - struct morus640_block *s = state->s; - - crypto_morus640_round(&s[0], &s[1], &s[2], &s[3], &s[4], &z, 5, 1); - crypto_morus640_round(&s[1], &s[2], &s[3], &s[4], &s[0], m, 31, 2); - crypto_morus640_round(&s[2], &s[3], &s[4], &s[0], &s[1], m, 7, 3); - crypto_morus640_round(&s[3], &s[4], &s[0], &s[1], &s[2], m, 22, 2); - crypto_morus640_round(&s[4], &s[0], &s[1], &s[2], &s[3], m, 13, 1); -} - -static void crypto_morus640_load_a(struct morus640_block *dst, const u8 *src) -{ - unsigned int i; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - dst->words[i] = le32_to_cpu(*(const __le32 *)src); - src += MORUS640_WORD_SIZE; - } -} - -static void crypto_morus640_load_u(struct morus640_block *dst, const u8 *src) -{ - unsigned int i; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - dst->words[i] = get_unaligned_le32(src); - src += MORUS640_WORD_SIZE; - } -} - -static void crypto_morus640_load(struct morus640_block *dst, const u8 *src) -{ - if (MORUS640_ALIGNED(src)) - crypto_morus640_load_a(dst, src); - else - crypto_morus640_load_u(dst, src); -} - -static void crypto_morus640_store_a(u8 *dst, const struct morus640_block *src) -{ - unsigned int i; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - *(__le32 *)dst = cpu_to_le32(src->words[i]); - dst += MORUS640_WORD_SIZE; - } -} - -static void crypto_morus640_store_u(u8 *dst, const struct morus640_block *src) -{ - unsigned int i; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - put_unaligned_le32(src->words[i], dst); - dst += MORUS640_WORD_SIZE; - } -} - -static void crypto_morus640_store(u8 *dst, const struct morus640_block *src) -{ - if (MORUS640_ALIGNED(dst)) - crypto_morus640_store_a(dst, src); - else - crypto_morus640_store_u(dst, src); -} - -static void crypto_morus640_ad(struct morus640_state *state, const u8 *src, - unsigned int size) -{ - struct morus640_block m; - - if (MORUS640_ALIGNED(src)) { - while (size >= MORUS640_BLOCK_SIZE) { - crypto_morus640_load_a(&m, src); - crypto_morus640_update(state, &m); - - size -= MORUS640_BLOCK_SIZE; - src += MORUS640_BLOCK_SIZE; - } - } else { - while (size >= MORUS640_BLOCK_SIZE) { - crypto_morus640_load_u(&m, src); - crypto_morus640_update(state, &m); - - size -= MORUS640_BLOCK_SIZE; - src += MORUS640_BLOCK_SIZE; - } - } -} - -static void crypto_morus640_core(const struct morus640_state *state, - struct morus640_block *blk) -{ - unsigned int i; - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - blk->words[(i + 3) % MORUS_BLOCK_WORDS] ^= state->s[1].words[i]; - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) { - blk->words[i] ^= state->s[0].words[i]; - blk->words[i] ^= state->s[2].words[i] & state->s[3].words[i]; - } -} - -static void crypto_morus640_encrypt_chunk(struct morus640_state *state, u8 *dst, - const u8 *src, unsigned int size) -{ - struct morus640_block c, m; - - if (MORUS640_ALIGNED(src) && MORUS640_ALIGNED(dst)) { - while (size >= MORUS640_BLOCK_SIZE) { - crypto_morus640_load_a(&m, src); - c = m; - crypto_morus640_core(state, &c); - crypto_morus640_store_a(dst, &c); - crypto_morus640_update(state, &m); - - src += MORUS640_BLOCK_SIZE; - dst += MORUS640_BLOCK_SIZE; - size -= MORUS640_BLOCK_SIZE; - } - } else { - while (size >= MORUS640_BLOCK_SIZE) { - crypto_morus640_load_u(&m, src); - c = m; - crypto_morus640_core(state, &c); - crypto_morus640_store_u(dst, &c); - crypto_morus640_update(state, &m); - - src += MORUS640_BLOCK_SIZE; - dst += MORUS640_BLOCK_SIZE; - size -= MORUS640_BLOCK_SIZE; - } - } - - if (size > 0) { - union morus640_block_in tail; - - memcpy(tail.bytes, src, size); - memset(tail.bytes + size, 0, MORUS640_BLOCK_SIZE - size); - - crypto_morus640_load_a(&m, tail.bytes); - c = m; - crypto_morus640_core(state, &c); - crypto_morus640_store_a(tail.bytes, &c); - crypto_morus640_update(state, &m); - - memcpy(dst, tail.bytes, size); - } -} - -static void crypto_morus640_decrypt_chunk(struct morus640_state *state, u8 *dst, - const u8 *src, unsigned int size) -{ - struct morus640_block m; - - if (MORUS640_ALIGNED(src) && MORUS640_ALIGNED(dst)) { - while (size >= MORUS640_BLOCK_SIZE) { - crypto_morus640_load_a(&m, src); - crypto_morus640_core(state, &m); - crypto_morus640_store_a(dst, &m); - crypto_morus640_update(state, &m); - - src += MORUS640_BLOCK_SIZE; - dst += MORUS640_BLOCK_SIZE; - size -= MORUS640_BLOCK_SIZE; - } - } else { - while (size >= MORUS640_BLOCK_SIZE) { - crypto_morus640_load_u(&m, src); - crypto_morus640_core(state, &m); - crypto_morus640_store_u(dst, &m); - crypto_morus640_update(state, &m); - - src += MORUS640_BLOCK_SIZE; - dst += MORUS640_BLOCK_SIZE; - size -= MORUS640_BLOCK_SIZE; - } - } - - if (size > 0) { - union morus640_block_in tail; - - memcpy(tail.bytes, src, size); - memset(tail.bytes + size, 0, MORUS640_BLOCK_SIZE - size); - - crypto_morus640_load_a(&m, tail.bytes); - crypto_morus640_core(state, &m); - crypto_morus640_store_a(tail.bytes, &m); - memset(tail.bytes + size, 0, MORUS640_BLOCK_SIZE - size); - crypto_morus640_load_a(&m, tail.bytes); - crypto_morus640_update(state, &m); - - memcpy(dst, tail.bytes, size); - } -} - -static void crypto_morus640_init(struct morus640_state *state, - const struct morus640_block *key, - const u8 *iv) -{ - static const struct morus640_block z = {}; - - unsigned int i; - - crypto_morus640_load(&state->s[0], iv); - state->s[1] = *key; - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - state->s[2].words[i] = U32_C(0xFFFFFFFF); - state->s[3] = crypto_morus640_const[0]; - state->s[4] = crypto_morus640_const[1]; - - for (i = 0; i < 16; i++) - crypto_morus640_update(state, &z); - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - state->s[1].words[i] ^= key->words[i]; -} - -static void crypto_morus640_process_ad(struct morus640_state *state, - struct scatterlist *sg_src, - unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus640_block m; - union morus640_block_in buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS640_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS640_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - - crypto_morus640_load_a(&m, buf.bytes); - crypto_morus640_update(state, &m); - - pos = 0; - left -= fill; - src += fill; - } - - crypto_morus640_ad(state, src, left); - src += left & ~(MORUS640_BLOCK_SIZE - 1); - left &= MORUS640_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos); - - crypto_morus640_load_a(&m, buf.bytes); - crypto_morus640_update(state, &m); - } -} - -static void crypto_morus640_process_crypt(struct morus640_state *state, - struct aead_request *req, - const struct morus640_ops *ops) -{ - struct skcipher_walk walk; - - ops->skcipher_walk_init(&walk, req, false); - - while (walk.nbytes) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - ops->crypt_chunk(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - - skcipher_walk_done(&walk, walk.nbytes - nbytes); - } -} - -static void crypto_morus640_final(struct morus640_state *state, - struct morus640_block *tag_xor, - u64 assoclen, u64 cryptlen) -{ - struct morus640_block tmp; - unsigned int i; - - tmp.words[0] = lower_32_bits(assoclen * 8); - tmp.words[1] = upper_32_bits(assoclen * 8); - tmp.words[2] = lower_32_bits(cryptlen * 8); - tmp.words[3] = upper_32_bits(cryptlen * 8); - - for (i = 0; i < MORUS_BLOCK_WORDS; i++) - state->s[4].words[i] ^= state->s[0].words[i]; - - for (i = 0; i < 10; i++) - crypto_morus640_update(state, &tmp); - - crypto_morus640_core(state, tag_xor); -} - -static int crypto_morus640_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen != MORUS640_BLOCK_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - crypto_morus640_load(&ctx->key, key); - return 0; -} - -static int crypto_morus640_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; -} - -static void crypto_morus640_crypt(struct aead_request *req, - struct morus640_block *tag_xor, - unsigned int cryptlen, - const struct morus640_ops *ops) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_state state; - - crypto_morus640_init(&state, &ctx->key, req->iv); - crypto_morus640_process_ad(&state, req->src, req->assoclen); - crypto_morus640_process_crypt(&state, req, ops); - crypto_morus640_final(&state, tag_xor, req->assoclen, cryptlen); -} - -static int crypto_morus640_encrypt(struct aead_request *req) -{ - static const struct morus640_ops ops = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_chunk = crypto_morus640_encrypt_chunk, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_block tag = {}; - union morus640_block_in tag_out; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus640_crypt(req, &tag, cryptlen, &ops); - crypto_morus640_store(tag_out.bytes, &tag); - - scatterwalk_map_and_copy(tag_out.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} - -static int crypto_morus640_decrypt(struct aead_request *req) -{ - static const struct morus640_ops ops = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_chunk = crypto_morus640_decrypt_chunk, - }; - static const u8 zeros[MORUS640_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - union morus640_block_in tag_in; - struct morus640_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag_in.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus640_load(&tag, tag_in.bytes); - crypto_morus640_crypt(req, &tag, cryptlen, &ops); - crypto_morus640_store(tag_in.bytes, &tag); - - return crypto_memneq(tag_in.bytes, zeros, authsize) ? -EBADMSG : 0; -} - -static int crypto_morus640_init_tfm(struct crypto_aead *tfm) -{ - return 0; -} - -static void crypto_morus640_exit_tfm(struct crypto_aead *tfm) -{ -} - -static struct aead_alg crypto_morus640_alg = { - .setkey = crypto_morus640_setkey, - .setauthsize = crypto_morus640_setauthsize, - .encrypt = crypto_morus640_encrypt, - .decrypt = crypto_morus640_decrypt, - .init = crypto_morus640_init_tfm, - .exit = crypto_morus640_exit_tfm, - - .ivsize = MORUS_NONCE_SIZE, - .maxauthsize = MORUS_MAX_AUTH_SIZE, - .chunksize = MORUS640_BLOCK_SIZE, - - .base = { - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct morus640_ctx), - .cra_alignmask = 0, - - .cra_priority = 100, - - .cra_name = "morus640", - .cra_driver_name = "morus640-generic", - - .cra_module = THIS_MODULE, - } -}; - -static int __init crypto_morus640_module_init(void) -{ - return crypto_register_aead(&crypto_morus640_alg); -} - -static void __exit crypto_morus640_module_exit(void) -{ - crypto_unregister_aead(&crypto_morus640_alg); -} - -subsys_initcall(crypto_morus640_module_init); -module_exit(crypto_morus640_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-640 AEAD algorithm"); -MODULE_ALIAS_CRYPTO("morus640"); -MODULE_ALIAS_CRYPTO("morus640-generic"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 5fe90ea46319..6258581aa628 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4768,18 +4768,6 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(michael_mic_tv_template) } - }, { - .alg = "morus1280", - .test = alg_test_aead, - .suite = { - .aead = __VECS(morus1280_tv_template) - } - }, { - .alg = "morus640", - .test = alg_test_aead, - .suite = { - .aead = __VECS(morus640_tv_template) - } }, { .alg = "nhpoly1305", .test = alg_test_hash, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index b743ca6d12cc..fca03fa018fc 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -20472,1713 +20472,6 @@ static const struct aead_testvec aegis256_tv_template[] = { }, }; -/* - * MORUS-640 test vectors - generated via reference implementation from - * SUPERCOP (https://bench.cr.yp.to/supercop.html): - * - * https://bench.cr.yp.to/supercop/supercop-20170228.tar.xz - * (see crypto_aead/morus640128v2/) - */ -static const struct aead_testvec morus640_tv_template[] = { - { - .key = "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00", - .klen = 16, - .iv = "\x0f\xc9\x8e\x67\x44\x9e\xaa\x86" - "\x20\x36\x2c\x24\xfe\xc9\x30\x81", - .assoc = "", - .alen = 0, - .ptext = "", - .plen = 0, - .ctext = "\x89\x62\x7d\xf3\x07\x9d\x52\x05" - "\x53\xc3\x04\x60\x93\xb4\x37\x9a", - .clen = 16, - }, { - .key = "\x3c\x24\x39\x9f\x10\x7b\xa8\x1b" - "\x80\xda\xb2\x91\xf9\x24\xc2\x06", - .klen = 16, - .iv = "\x4b\xed\xc8\x07\x54\x1a\x52\xa2" - "\xa1\x10\xde\xb5\xf8\xed\xf3\x87", - .assoc = "", - .alen = 0, - .ptext = "\x69", - .plen = 1, - .ctext = "\xa8\x8d\xe4\x90\xb5\x50\x8f\x78" - "\xb6\x10\x9a\x59\x5f\x61\x37\x70" - "\x09", - .clen = 17, - }, { - .key = "\x79\x49\x73\x3e\x20\xf7\x51\x37" - "\x01\xb4\x64\x22\xf3\x48\x85\x0c", - .klen = 16, - .iv = "\x88\x12\x01\xa6\x64\x96\xfb\xbe" - "\x22\xea\x90\x47\xf2\x11\xb5\x8e", - .assoc = "", - .alen = 0, - .ptext = "\xa6\xa4\x1e\x76\xec\xd4\x50\xcc" - "\x62\x58\xe9\x8f\xef\xa4\x17", - .plen = 15, - .ctext = "\x76\xdd\xb9\x05\x3d\xce\x61\x38" - "\xf3\xef\xf7\xe5\xd7\xfd\x70\xa5" - "\xcf\x9d\x64\xb8\x0a\x9f\xfd\x8b" - "\xd4\x6e\xfe\xd9\xc8\x63\x4b", - .clen = 31, - }, { - .key = "\xb5\x6e\xad\xdd\x30\x72\xfa\x53" - "\x82\x8e\x16\xb4\xed\x6d\x47\x12", - .klen = 16, - .iv = "\xc4\x37\x3b\x45\x74\x11\xa4\xda" - "\xa2\xc5\x42\xd8\xec\x36\x78\x94", - .assoc = "", - .alen = 0, - .ptext = "\xe2\xc9\x58\x15\xfc\x4f\xf8\xe8" - "\xe3\x32\x9b\x21\xe9\xc8\xd9\x97", - .plen = 16, - .ctext = "\xdc\x72\xe8\x14\xfb\x63\xad\x72" - "\x1f\x57\x9a\x1f\x88\x81\xdb\xd6" - "\xc1\x91\x9d\xb9\x25\xc4\x99\x4c" - "\x97\xcd\x8a\x0c\x9d\x68\x00\x1c", - .clen = 32, - }, { - .key = "\xf2\x92\xe6\x7d\x40\xee\xa3\x6f" - "\x03\x68\xc8\x45\xe7\x91\x0a\x18", - .klen = 16, - .iv = "\x01\x5c\x75\xe5\x84\x8d\x4d\xf6" - "\x23\x9f\xf4\x6a\xe6\x5a\x3b\x9a", - .assoc = "", - .alen = 0, - .ptext = "\x1f\xee\x92\xb4\x0c\xcb\xa1\x04" - "\x64\x0c\x4d\xb2\xe3\xec\x9c\x9d" - "\x09", - .plen = 17, - .ctext = "\x6b\x4f\x3b\x90\x9a\xa2\xb3\x82" - "\x0a\xb8\x55\xee\xeb\x73\x4d\x7f" - "\x54\x11\x3a\x8a\x31\xa3\xb5\xf2" - "\xcd\x49\xdb\xf3\xee\x26\xbd\xa2" - "\x0d", - .clen = 33, - }, { - .key = "\x2e\xb7\x20\x1c\x50\x6a\x4b\x8b" - "\x84\x42\x7a\xd7\xe1\xb5\xcd\x1f", - .klen = 16, - .iv = "\x3d\x80\xae\x84\x94\x09\xf6\x12" - "\xa4\x79\xa6\xfb\xe0\x7f\xfd\xa0", - .assoc = "", - .alen = 0, - .ptext = "\x5c\x13\xcb\x54\x1c\x47\x4a\x1f" - "\xe5\xe6\xff\x44\xdd\x11\x5f\xa3" - "\x33\xdd\xc2\xf8\xdd\x18\x2b\x93" - "\x57\x05\x01\x1c\x66\x22\xd3", - .plen = 31, - .ctext = "\x59\xd1\x0f\x6b\xee\x27\x84\x92" - "\xb7\xa9\xb5\xdd\x02\xa4\x12\xa5" - "\x50\x32\xb4\x9a\x2e\x35\x83\x55" - "\x36\x12\x12\xed\xa3\x31\xc5\x30" - "\xa7\xe2\x4a\x6d\x05\x59\x43\x91" - "\x75\xfa\x6c\x17\xc6\x73\xca", - .clen = 47, - }, { - .key = "\x6b\xdc\x5a\xbb\x60\xe5\xf4\xa6" - "\x05\x1d\x2c\x68\xdb\xda\x8f\x25", - .klen = 16, - .iv = "\x7a\xa5\xe8\x23\xa4\x84\x9e\x2d" - "\x25\x53\x58\x8c\xda\xa3\xc0\xa6", - .assoc = "", - .alen = 0, - .ptext = "\x98\x37\x05\xf3\x2c\xc2\xf3\x3b" - "\x66\xc0\xb1\xd5\xd7\x35\x21\xaa" - "\x5d\x9f\xce\x7c\xe2\xb8\xad\xad" - "\x19\x33\xe0\xf4\x40\x81\x72\x28", - .plen = 32, - .ctext = "\xdb\x49\x68\x0f\x91\x5b\x21\xb1" - "\xcf\x50\xb2\x4c\x32\xe1\xa6\x69" - "\xc0\xfb\x44\x1f\xa0\x9a\xeb\x39" - "\x1b\xde\x68\x38\xcc\x27\x52\xc5" - "\xf6\x3e\x74\xea\x66\x5b\x5f\x0c" - "\x65\x9e\x58\xe6\x52\xa2\xfe\x59", - .clen = 48, - }, { - .key = "\xa7\x00\x93\x5b\x70\x61\x9d\xc2" - "\x86\xf7\xde\xfa\xd5\xfe\x52\x2b", - .klen = 16, - .iv = "\xb6\xca\x22\xc3\xb4\x00\x47\x49" - "\xa6\x2d\x0a\x1e\xd4\xc7\x83\xad", - .assoc = "\xc5", - .alen = 1, - .ptext = "", - .plen = 0, - .ctext = "\x56\xe7\x24\x52\xdd\x95\x60\x5b" - "\x09\x48\x39\x69\x9c\xb3\x62\x46", - .clen = 16, - }, { - .key = "\xe4\x25\xcd\xfa\x80\xdd\x46\xde" - "\x07\xd1\x90\x8b\xcf\x23\x15\x31", - .klen = 16, - .iv = "\xf3\xee\x5c\x62\xc4\x7c\xf0\x65" - "\x27\x08\xbd\xaf\xce\xec\x45\xb3", - .assoc = "\x02\xb8\xea\xca\x09\x1b\x9a\xec" - "\x47\x3e\xe9\xd4\xcc\xb5\x76", - .alen = 15, - .ptext = "", - .plen = 0, - .ctext = "\xdd\xfa\x6c\x1f\x5d\x86\x87\x01" - "\x13\xe5\x73\x46\x46\xf2\x5c\xe1", - .clen = 16, - }, { - .key = "\x20\x4a\x07\x99\x91\x58\xee\xfa" - "\x88\xab\x42\x1c\xc9\x47\xd7\x38", - .klen = 16, - .iv = "\x2f\x13\x95\x01\xd5\xf7\x99\x81" - "\xa8\xe2\x6f\x41\xc8\x10\x08\xb9", - .assoc = "\x3f\xdc\x24\x69\x19\x96\x43\x08" - "\xc8\x18\x9b\x65\xc6\xd9\x39\x3b", - .alen = 16, - .ptext = "", - .plen = 0, - .ctext = "\xa6\x1b\xb9\xd7\x5e\x3c\xcf\xac" - "\xa9\x21\x45\x0b\x16\x52\xf7\xe1", - .clen = 16, - }, { - .key = "\x5d\x6f\x41\x39\xa1\xd4\x97\x16" - "\x09\x85\xf4\xae\xc3\x6b\x9a\x3e", - .klen = 16, - .iv = "\x6c\x38\xcf\xa1\xe5\x73\x41\x9d" - "\x29\xbc\x21\xd2\xc2\x35\xcb\xbf", - .assoc = "\x7b\x01\x5d\x08\x29\x12\xec\x24" - "\x49\xf3\x4d\xf7\xc0\xfe\xfb\x41" - "\x3c", - .alen = 17, - .ptext = "", - .plen = 0, - .ctext = "\x15\xff\xde\x3b\x34\xfc\xf6\xf9" - "\xbb\xa8\x62\xad\x0a\xf5\x48\x60", - .clen = 16, - }, { - .key = "\x99\x93\x7a\xd8\xb1\x50\x40\x31" - "\x8a\x60\xa6\x3f\xbd\x90\x5d\x44", - .klen = 16, - .iv = "\xa8\x5c\x09\x40\xf5\xef\xea\xb8" - "\xaa\x96\xd3\x64\xbc\x59\x8d\xc6", - .assoc = "\xb8\x26\x97\xa8\x39\x8e\x94\x3f" - "\xca\xcd\xff\x88\xba\x22\xbe\x47" - "\x67\xba\x85\xf1\xbb\x30\x56\x26" - "\xaf\x0b\x02\x38\xcc\x44\xa7", - .alen = 31, - .ptext = "", - .plen = 0, - .ctext = "\xd2\x9d\xf8\x3b\xd7\x84\xe9\x2d" - "\x4b\xef\x75\x16\x0a\x99\xae\x6b", - .clen = 16, - }, { - .key = "\xd6\xb8\xb4\x77\xc1\xcb\xe9\x4d" - "\x0a\x3a\x58\xd1\xb7\xb4\x1f\x4a", - .klen = 16, - .iv = "\xe5\x81\x42\xdf\x05\x6a\x93\xd4" - "\x2b\x70\x85\xf5\xb6\x7d\x50\xcc", - .assoc = "\xf4\x4a\xd1\x47\x49\x09\x3d\x5b" - "\x4b\xa7\xb1\x19\xb4\x46\x81\x4d" - "\x91\x7c\x91\x75\xc0\xd0\xd8\x40" - "\x71\x39\xe1\x10\xa6\xa3\x46\x7a", - .alen = 32, - .ptext = "", - .plen = 0, - .ctext = "\xe4\x8d\xa7\xa7\x45\xc1\x31\x4f" - "\xce\xfb\xaf\xd6\xc2\xe6\xee\xc0", - .clen = 16, - }, { - .key = "\x12\xdd\xee\x17\xd1\x47\x92\x69" - "\x8b\x14\x0a\x62\xb1\xd9\xe2\x50", - .klen = 16, - .iv = "\x22\xa6\x7c\x7f\x15\xe6\x3c\xf0" - "\xac\x4b\x37\x86\xb0\xa2\x13\xd2", - .assoc = "\x31", - .alen = 1, - .ptext = "\x40", - .plen = 1, - .ctext = "\xe2\x67\x38\x4f\xb9\xad\x7d\x38" - "\x01\xfe\x84\x14\x85\xf8\xd1\xe3" - "\x22", - .clen = 17, - }, { - .key = "\x4f\x01\x27\xb6\xe1\xc3\x3a\x85" - "\x0c\xee\xbc\xf4\xab\xfd\xa5\x57", - .klen = 16, - .iv = "\x5e\xcb\xb6\x1e\x25\x62\xe4\x0c" - "\x2d\x25\xe9\x18\xaa\xc6\xd5\xd8", - .assoc = "\x6d\x94\x44\x86\x69\x00\x8f\x93" - "\x4d\x5b\x15\x3c\xa8\x8f\x06", - .alen = 15, - .ptext = "\x7c\x5d\xd3\xee\xad\x9f\x39\x1a" - "\x6d\x92\x42\x61\xa7\x58\x37", - .plen = 15, - .ctext = "\x77\x32\x61\xeb\xb4\x33\x29\x92" - "\x29\x95\xc5\x8e\x85\x76\xab\xfc" - "\x07\x95\xa7\x44\x74\xf7\x22\xff" - "\xd8\xd8\x36\x3d\x8a\x7f\x9e", - .clen = 31, - }, { - .key = "\x8b\x26\x61\x55\xf1\x3e\xe3\xa1" - "\x8d\xc8\x6e\x85\xa5\x21\x67\x5d", - .klen = 16, - .iv = "\x9b\xef\xf0\xbd\x35\xdd\x8d\x28" - "\xad\xff\x9b\xa9\xa4\xeb\x98\xdf", - .assoc = "\xaa\xb8\x7e\x25\x79\x7c\x37\xaf" - "\xce\x36\xc7\xce\xa2\xb4\xc9\x60", - .alen = 16, - .ptext = "\xb9\x82\x0c\x8d\xbd\x1b\xe2\x36" - "\xee\x6c\xf4\xf2\xa1\x7d\xf9\xe2", - .plen = 16, - .ctext = "\xd8\xfd\x44\x45\xf6\x42\x12\x38" - "\xf2\x0b\xea\x4f\x9e\x11\x61\x07" - "\x48\x67\x98\x18\x9b\xd0\x0c\x59" - "\x67\xa4\x11\xb3\x2b\xd6\xc1\x70", - .clen = 32, - }, { - .key = "\xc8\x4b\x9b\xf5\x01\xba\x8c\xbd" - "\x0e\xa3\x21\x16\x9f\x46\x2a\x63", - .klen = 16, - .iv = "\xd7\x14\x29\x5d\x45\x59\x36\x44" - "\x2e\xd9\x4d\x3b\x9e\x0f\x5b\xe5", - .assoc = "\xe6\xdd\xb8\xc4\x89\xf8\xe0\xca" - "\x4f\x10\x7a\x5f\x9c\xd8\x8b\x66" - "\x3b", - .alen = 17, - .ptext = "\xf5\xa6\x46\x2c\xce\x97\x8a\x51" - "\x6f\x46\xa6\x83\x9b\xa1\xbc\xe8" - "\x05", - .plen = 17, - .ctext = "\xb1\xab\x53\x4e\xc7\x40\x16\xb6" - "\x71\x3a\x00\x9f\x41\x88\xb0\xb2" - "\x71\x83\x85\x5f\xc8\x79\x0a\x99" - "\x99\xdc\x89\x1c\x88\xd2\x3e\xf9" - "\x83", - .clen = 33, - }, { - .key = "\x05\x70\xd5\x94\x12\x36\x35\xd8" - "\x8f\x7d\xd3\xa8\x99\x6a\xed\x69", - .klen = 16, - .iv = "\x14\x39\x63\xfc\x56\xd5\xdf\x5f" - "\xaf\xb3\xff\xcc\x98\x33\x1d\xeb", - .assoc = "\x23\x02\xf1\x64\x9a\x73\x89\xe6" - "\xd0\xea\x2c\xf1\x96\xfc\x4e\x6d" - "\x65\x48\xcb\x0a\xda\xf0\x62\xc0" - "\x38\x1d\x3b\x4a\xe9\x7e\x62", - .alen = 31, - .ptext = "\x32\xcb\x80\xcc\xde\x12\x33\x6d" - "\xf0\x20\x58\x15\x95\xc6\x7f\xee" - "\x2f\xf9\x4e\x2c\x1b\x98\x43\xc7" - "\x68\x28\x73\x40\x9f\x96\x4a", - .plen = 31, - .ctext = "\x29\xc4\xf0\x03\xc1\x86\xdf\x06" - "\x5c\x7b\xef\x64\x87\x00\xd1\x37" - "\xa7\x08\xbc\x7f\x8f\x41\x54\xd0" - "\x3e\xf1\xc3\xa2\x96\x84\xdd\x2a" - "\x2d\x21\x30\xf9\x02\xdb\x06\x0c" - "\xf1\x5a\x66\x69\xe0\xca\x83", - .clen = 47, - }, { - .key = "\x41\x94\x0e\x33\x22\xb1\xdd\xf4" - "\x10\x57\x85\x39\x93\x8f\xaf\x70", - .klen = 16, - .iv = "\x50\x5d\x9d\x9b\x66\x50\x88\x7b" - "\x30\x8e\xb1\x5e\x92\x58\xe0\xf1", - .assoc = "\x5f\x27\x2b\x03\xaa\xef\x32\x02" - "\x50\xc4\xde\x82\x90\x21\x11\x73" - "\x8f\x0a\xd6\x8f\xdf\x90\xe4\xda" - "\xf9\x4a\x1a\x23\xc3\xdd\x02\x81", - .alen = 32, - .ptext = "\x6e\xf0\xba\x6b\xee\x8e\xdc\x89" - "\x71\xfb\x0a\xa6\x8f\xea\x41\xf4" - "\x5a\xbb\x59\xb0\x20\x38\xc5\xe0" - "\x29\x56\x52\x19\x79\xf5\xe9\x37", - .plen = 32, - .ctext = "\xe2\x2e\x44\xdf\xd3\x60\x6d\xb2" - "\x70\x57\x37\xc5\xc2\x4f\x8d\x14" - "\xc6\xbf\x8b\xec\xf5\x62\x67\xf2" - "\x2f\xa1\xe6\xd6\xa7\xb1\x8c\x54" - "\xe5\x6b\x49\xf9\x6e\x90\xc3\xaa" - "\x7a\x00\x2e\x4d\x7f\x31\x2e\x81", - .clen = 48, - }, { - .key = "\x7e\xb9\x48\xd3\x32\x2d\x86\x10" - "\x91\x31\x37\xcb\x8d\xb3\x72\x76", - .klen = 16, - .iv = "\x8d\x82\xd6\x3b\x76\xcc\x30\x97" - "\xb1\x68\x63\xef\x8c\x7c\xa3\xf7", - .assoc = "\x9c\x4b\x65\xa2\xba\x6b\xdb\x1e" - "\xd1\x9e\x90\x13\x8a\x45\xd3\x79" - "\xba\xcd\xe2\x13\xe4\x30\x66\xf4" - "\xba\x78\xf9\xfb\x9d\x3c\xa1\x58" - "\x1a", - .alen = 33, - .ptext = "\xab\x14\xf3\x0a\xfe\x0a\x85\xa5" - "\xf2\xd5\xbc\x38\x89\x0e\x04\xfb" - "\x84\x7d\x65\x34\x25\xd8\x47\xfa" - "\xeb\x83\x31\xf1\x54\x54\x89\x0d" - "\x9d\x4d\x54\x51\x84\x61\xf6\x8e" - "\x03\x31\xf2\x25\x16\xcc\xaa\xc6" - "\x75\x73\x20\x30\x59\x54\xb2\xf0" - "\x3a\x4b\xe0\x23\x8e\xa6\x08\x35" - "\x8a", - .plen = 65, - .ctext = "\xc7\xca\x26\x61\x57\xee\xa2\xb9" - "\xb1\x37\xde\x95\x06\x90\x11\x08" - "\x4d\x30\x9f\x24\xc0\x56\xb7\xe1" - "\x0b\x9f\xd2\x57\xe9\xd2\xb1\x76" - "\x56\x9a\xb4\x58\xc5\x08\xfc\xb5" - "\xf2\x31\x9b\xc9\xcd\xb3\x64\xdb" - "\x6f\x50\xbf\xf4\x73\x9d\xfb\x6b" - "\xef\x35\x25\x48\xed\xcf\x29\xa8" - "\xac\xc3\xb9\xcb\x61\x8f\x73\x92" - "\x2c\x7a\x6f\xda\xf9\x09\x6f\xe1" - "\xc4", - .clen = 81, - }, { - .key = "\xba\xde\x82\x72\x42\xa9\x2f\x2c" - "\x12\x0b\xe9\x5c\x87\xd7\x35\x7c", - .klen = 16, - .iv = "\xc9\xa7\x10\xda\x86\x48\xd9\xb3" - "\x32\x42\x15\x80\x85\xa1\x65\xfe", - .assoc = "\xd8\x70\x9f\x42\xca\xe6\x83\x3a" - "\x52\x79\x42\xa5\x84\x6a\x96\x7f" - "\xe4\x8f\xed\x97\xe9\xd0\xe8\x0d" - "\x7c\xa6\xd8\xd4\x77\x9b\x40\x2e" - "\x28\xce\x57\x34\xcd\x6e\x84\x4c" - "\x17\x3c\xe1\xb2\xa8\x0b\xbb\xf1" - "\x96\x41\x0d\x69\xe8\x54\x0a\xc8" - "\x15\x4e\x91\x92\x89\x4b\xb7\x9b" - "\x21", - .alen = 65, - .ptext = "\xe8\x39\x2d\xaa\x0e\x85\x2d\xc1" - "\x72\xaf\x6e\xc9\x82\x33\xc7\x01" - "\xaf\x40\x70\xb8\x2a\x78\xc9\x14" - "\xac\xb1\x10\xca\x2e\xb3\x28\xe4" - "\xac", - .plen = 33, - .ctext = "\x57\xcd\x3d\x46\xc5\xf9\x68\x3b" - "\x2c\x0f\xb4\x7e\x7b\x64\x3e\x40" - "\xf3\x78\x63\x34\x89\x79\x39\x6b" - "\x61\x64\x4a\x9a\xfa\x70\xa4\xd3" - "\x54\x0b\xea\x05\xa6\x95\x64\xed" - "\x3d\x69\xa2\x0c\x27\x56\x2f\x34" - "\x66", - .clen = 49, - }, { - .key = "\xf7\x02\xbb\x11\x52\x24\xd8\x48" - "\x93\xe6\x9b\xee\x81\xfc\xf7\x82", - .klen = 16, - .iv = "\x06\xcc\x4a\x79\x96\xc3\x82\xcf" - "\xb3\x1c\xc7\x12\x7f\xc5\x28\x04", - .assoc = "\x15\x95\xd8\xe1\xda\x62\x2c\x56" - "\xd3\x53\xf4\x36\x7e\x8e\x59\x85", - .alen = 16, - .ptext = "\x24\x5e\x67\x49\x1e\x01\xd6\xdd" - "\xf3\x89\x20\x5b\x7c\x57\x89\x07", - .plen = 16, - .ctext = "\xfc\x85\x06\x28\x8f\xe8\x23\x1f" - "\x33\x98\x87\xde\x08\xb6\xb6\xae" - "\x3e\xa4\xf8\x19\xf1\x92\x60\x39" - "\xb9\x6b\x3f\xdf\xc8\xcb\x30", - .clen = 31, - }, { - .key = "\x33\x27\xf5\xb1\x62\xa0\x80\x63" - "\x14\xc0\x4d\x7f\x7b\x20\xba\x89", - .klen = 16, - .iv = "\x42\xf0\x84\x19\xa6\x3f\x2b\xea" - "\x34\xf6\x79\xa3\x79\xe9\xeb\x0a", - .assoc = "\x51\xb9\x12\x80\xea\xde\xd5\x71" - "\x54\x2d\xa6\xc8\x78\xb2\x1b\x8c", - .alen = 16, - .ptext = "\x61\x83\xa0\xe8\x2e\x7d\x7f\xf8" - "\x74\x63\xd2\xec\x76\x7c\x4c\x0d", - .plen = 16, - .ctext = "\x74\x7d\x70\x07\xe9\xba\x01\xee" - "\x6c\xc6\x6f\x50\x25\x33\xbe\x50" - "\x17\xb8\x17\x62\xed\x80\xa2\xf5" - "\x03\xde\x85\x71\x5d\x34", - .clen = 30, - }, { - .key = "\x70\x4c\x2f\x50\x72\x1c\x29\x7f" - "\x95\x9a\xff\x10\x75\x45\x7d\x8f", - .klen = 16, - .iv = "\x7f\x15\xbd\xb8\xb6\xba\xd3\x06" - "\xb5\xd1\x2b\x35\x73\x0e\xad\x10", - .assoc = "\x8e\xde\x4c\x20\xfa\x59\x7e\x8d" - "\xd5\x07\x58\x59\x72\xd7\xde\x92", - .alen = 16, - .ptext = "\x9d\xa7\xda\x88\x3e\xf8\x28\x14" - "\xf5\x3e\x85\x7d\x70\xa0\x0f\x13", - .plen = 16, - .ctext = "\xf4\xb3\x85\xf9\xac\xde\xb1\x38" - "\x29\xfd\x6c\x7c\x49\xe5\x1d\xaf" - "\xba\xea\xd4\xfa\x3f\x11\x33\x98", - .clen = 24, - }, { - .key = "\xac\x70\x69\xef\x82\x97\xd2\x9b" - "\x15\x74\xb1\xa2\x6f\x69\x3f\x95", - .klen = 16, - .iv = "\xbb\x3a\xf7\x57\xc6\x36\x7c\x22" - "\x36\xab\xde\xc6\x6d\x32\x70\x17", - .assoc = "\xcb\x03\x85\xbf\x0a\xd5\x26\xa9" - "\x56\xe1\x0a\xeb\x6c\xfb\xa1\x98", - .alen = 16, - .ptext = "\xda\xcc\x14\x27\x4e\x74\xd1\x30" - "\x76\x18\x37\x0f\x6a\xc4\xd1\x1a", - .plen = 16, - .ctext = "\xe6\x5c\x49\x4f\x78\xf3\x62\x86" - "\xe1\xb7\xa5\xc3\x32\x88\x3c\x8c" - "\x6e", - .clen = 17, - }, -}; - -/* - * MORUS-1280 test vectors - generated via reference implementation from - * SUPERCOP (https://bench.cr.yp.to/supercop.html): - * - * https://bench.cr.yp.to/supercop/supercop-20170228.tar.xz - * (see crypto_aead/morus1280128v2/ and crypto_aead/morus1280256v2/ ) - */ -static const struct aead_testvec morus1280_tv_template[] = { - { - .key = "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00", - .klen = 16, - .iv = "\x0f\xc9\x8e\x67\x44\x9e\xaa\x86" - "\x20\x36\x2c\x24\xfe\xc9\x30\x81", - .assoc = "", - .alen = 0, - .ptext = "", - .plen = 0, - .ctext = "\x91\x85\x0f\xf5\x52\x9e\xce\xce" - "\x65\x99\xc7\xbf\xd3\x76\xe8\x98", - .clen = 16, - }, { - .key = "\x3c\x24\x39\x9f\x10\x7b\xa8\x1b" - "\x80\xda\xb2\x91\xf9\x24\xc2\x06", - .klen = 16, - .iv = "\x4b\xed\xc8\x07\x54\x1a\x52\xa2" - "\xa1\x10\xde\xb5\xf8\xed\xf3\x87", - .assoc = "", - .alen = 0, - .ptext = "\x69", - .plen = 1, - .ctext = "\x88\xc3\x4c\xf0\x2f\x43\x76\x13" - "\x96\xda\x76\x34\x33\x4e\xd5\x39" - "\x73", - .clen = 17, - }, { - .key = "\x79\x49\x73\x3e\x20\xf7\x51\x37" - "\x01\xb4\x64\x22\xf3\x48\x85\x0c", - .klen = 16, - .iv = "\x88\x12\x01\xa6\x64\x96\xfb\xbe" - "\x22\xea\x90\x47\xf2\x11\xb5\x8e", - .assoc = "", - .alen = 0, - .ptext = "\xa6\xa4\x1e\x76\xec\xd4\x50\xcc" - "\x62\x58\xe9\x8f\xef\xa4\x17\x91" - "\xb4\x96\x9f\x6b\xce\x38\xa5\x46" - "\x13\x7d\x64\x93\xd7\x05\xf5", - .plen = 31, - .ctext = "\x3e\x5c\x3b\x58\x3b\x7d\x2a\x22" - "\x75\x0b\x24\xa6\x0e\xc3\xde\x52" - "\x97\x0b\x64\xd4\xce\x90\x52\xf7" - "\xef\xdb\x6a\x38\xd2\xa8\xa1\x0d" - "\xe0\x61\x33\x24\xc6\x4d\x51\xbc" - "\xa4\x21\x74\xcf\x19\x16\x59", - .clen = 47, - }, { - .key = "\xb5\x6e\xad\xdd\x30\x72\xfa\x53" - "\x82\x8e\x16\xb4\xed\x6d\x47\x12", - .klen = 16, - .iv = "\xc4\x37\x3b\x45\x74\x11\xa4\xda" - "\xa2\xc5\x42\xd8\xec\x36\x78\x94", - .assoc = "", - .alen = 0, - .ptext = "\xe2\xc9\x58\x15\xfc\x4f\xf8\xe8" - "\xe3\x32\x9b\x21\xe9\xc8\xd9\x97" - "\xde\x58\xab\xf0\xd3\xd8\x27\x60" - "\xd5\xaa\x43\x6b\xb1\x64\x95\xa4", - .plen = 32, - .ctext = "\x30\x82\x9c\x2b\x67\xcb\xf9\x1f" - "\xde\x9f\x77\xb2\xda\x92\x61\x5c" - "\x09\x0b\x2d\x9a\x26\xaa\x1c\x06" - "\xab\x74\xb7\x2b\x95\x5f\x9f\xa1" - "\x9a\xff\x50\xa0\xa2\xff\xc5\xad" - "\x21\x8e\x84\x5c\x12\x61\xb2\xae", - .clen = 48, - }, { - .key = "\xf2\x92\xe6\x7d\x40\xee\xa3\x6f" - "\x03\x68\xc8\x45\xe7\x91\x0a\x18", - .klen = 16, - .iv = "\x01\x5c\x75\xe5\x84\x8d\x4d\xf6" - "\x23\x9f\xf4\x6a\xe6\x5a\x3b\x9a", - .assoc = "", - .alen = 0, - .ptext = "\x1f\xee\x92\xb4\x0c\xcb\xa1\x04" - "\x64\x0c\x4d\xb2\xe3\xec\x9c\x9d" - "\x09\x1a\xb7\x74\xd8\x78\xa9\x79" - "\x96\xd8\x22\x43\x8c\xc3\x34\x7b" - "\xc4", - .plen = 33, - .ctext = "\x67\x5d\x8e\x45\xc8\x39\xf5\x17" - "\xc1\x1d\x2a\xdd\x88\x67\xda\x1f" - "\x6d\xe8\x37\x28\x5a\xc1\x5e\x9f" - "\xa6\xec\xc6\x92\x05\x4b\xc0\xa3" - "\x63\xef\x88\xa4\x9b\x0a\x5c\xed" - "\x2b\x6a\xac\x63\x52\xaa\x10\x94" - "\xd0", - .clen = 49, - }, { - .key = "\x2e\xb7\x20\x1c\x50\x6a\x4b\x8b" - "\x84\x42\x7a\xd7\xe1\xb5\xcd\x1f", - .klen = 16, - .iv = "\x3d\x80\xae\x84\x94\x09\xf6\x12" - "\xa4\x79\xa6\xfb\xe0\x7f\xfd\xa0", - .assoc = "", - .alen = 0, - .ptext = "\x5c\x13\xcb\x54\x1c\x47\x4a\x1f" - "\xe5\xe6\xff\x44\xdd\x11\x5f\xa3" - "\x33\xdd\xc2\xf8\xdd\x18\x2b\x93" - "\x57\x05\x01\x1c\x66\x22\xd3\x51" - "\xd3\xdf\x18\xc9\x30\x66\xed\xb1" - "\x96\x58\xd5\x8c\x64\x8c\x7c\xf5" - "\x01\xd0\x74\x5f\x9b\xaa\xf6\xd1" - "\xe6\x16\xa2\xac\xde\x47\x40", - .plen = 63, - .ctext = "\x7d\x61\x1a\x35\x20\xcc\x07\x88" - "\x03\x98\x87\xcf\xc0\x6e\x4d\x19" - "\xe3\xd4\x0b\xfb\x29\x8f\x49\x1a" - "\x3a\x06\x77\xce\x71\x2c\xcd\xdd" - "\xed\xf6\xc9\xbe\xa6\x3b\xb8\xfc" - "\x6c\xbe\x77\xed\x74\x0e\x20\x85" - "\xd0\x65\xde\x24\x6f\xe3\x25\xc5" - "\xdf\x5b\x0f\xbd\x8a\x88\x78\xc9" - "\xe5\x81\x37\xde\x84\x7a\xf6\x84" - "\x99\x7a\x72\x9c\x54\x31\xa1", - .clen = 79, - }, { - .key = "\x6b\xdc\x5a\xbb\x60\xe5\xf4\xa6" - "\x05\x1d\x2c\x68\xdb\xda\x8f\x25", - .klen = 16, - .iv = "\x7a\xa5\xe8\x23\xa4\x84\x9e\x2d" - "\x25\x53\x58\x8c\xda\xa3\xc0\xa6", - .assoc = "", - .alen = 0, - .ptext = "\x98\x37\x05\xf3\x2c\xc2\xf3\x3b" - "\x66\xc0\xb1\xd5\xd7\x35\x21\xaa" - "\x5d\x9f\xce\x7c\xe2\xb8\xad\xad" - "\x19\x33\xe0\xf4\x40\x81\x72\x28" - "\xe1\x8b\x1c\xf8\x91\x78\xff\xaf" - "\xb0\x68\x69\xf2\x27\x35\x91\x84" - "\x2e\x37\x5b\x00\x04\xff\x16\x9c" - "\xb5\x19\x39\xeb\xd9\xcd\x29\x9a", - .plen = 64, - .ctext = "\x05\xc5\xb1\xf9\x1b\xb9\xab\x2c" - "\xa5\x07\x12\xa7\x12\x39\x60\x66" - "\x30\x81\x4a\x03\x78\x28\x45\x52" - "\xd2\x2b\x24\xfd\x8b\xa5\xb7\x66" - "\x6f\x45\xd7\x3b\x67\x6f\x51\xb9" - "\xc0\x3d\x6c\xca\x1e\xae\xff\xb6" - "\x79\xa9\xe4\x82\x5d\x4c\x2d\xdf" - "\xeb\x71\x40\xc9\x2c\x40\x45\x6d" - "\x73\x77\x01\xf3\x4f\xf3\x9d\x2a" - "\x5d\x57\xa8\xa1\x18\xa2\xad\xcb", - .clen = 80, - }, { - .key = "\xa7\x00\x93\x5b\x70\x61\x9d\xc2" - "\x86\xf7\xde\xfa\xd5\xfe\x52\x2b", - .klen = 16, - .iv = "\xb6\xca\x22\xc3\xb4\x00\x47\x49" - "\xa6\x2d\x0a\x1e\xd4\xc7\x83\xad", - .assoc = "\xc5", - .alen = 1, - .ptext = "", - .plen = 0, - .ctext = "\x4d\xbf\x11\xac\x7f\x97\x0b\x2e" - "\x89\x3b\x9d\x0f\x83\x1c\x08\xc3", - .clen = 16, - }, { - .key = "\xe4\x25\xcd\xfa\x80\xdd\x46\xde" - "\x07\xd1\x90\x8b\xcf\x23\x15\x31", - .klen = 16, - .iv = "\xf3\xee\x5c\x62\xc4\x7c\xf0\x65" - "\x27\x08\xbd\xaf\xce\xec\x45\xb3", - .assoc = "\x02\xb8\xea\xca\x09\x1b\x9a\xec" - "\x47\x3e\xe9\xd4\xcc\xb5\x76\x34" - "\xe8\x73\x62\x64\xab\x50\xd0\xda" - "\x6b\x83\x66\xaf\x3e\x27\xc9", - .alen = 31, - .ptext = "", - .plen = 0, - .ctext = "\x5b\xc0\x8d\x54\xe4\xec\xbe\x38" - "\x03\x12\xf9\xcc\x9e\x46\x42\x92", - .clen = 16, - }, { - .key = "\x20\x4a\x07\x99\x91\x58\xee\xfa" - "\x88\xab\x42\x1c\xc9\x47\xd7\x38", - .klen = 16, - .iv = "\x2f\x13\x95\x01\xd5\xf7\x99\x81" - "\xa8\xe2\x6f\x41\xc8\x10\x08\xb9", - .assoc = "\x3f\xdc\x24\x69\x19\x96\x43\x08" - "\xc8\x18\x9b\x65\xc6\xd9\x39\x3b" - "\x12\x35\x6e\xe8\xb0\xf0\x52\xf3" - "\x2d\xb0\x45\x87\x18\x86\x68\xf6", - .alen = 32, - .ptext = "", - .plen = 0, - .ctext = "\x48\xc5\xc3\x4c\x40\x2e\x2f\xc2" - "\x6d\x65\xe0\x67\x9c\x1d\xa0\xf0", - .clen = 16, - }, { - .key = "\x5d\x6f\x41\x39\xa1\xd4\x97\x16" - "\x09\x85\xf4\xae\xc3\x6b\x9a\x3e", - .klen = 16, - .iv = "\x6c\x38\xcf\xa1\xe5\x73\x41\x9d" - "\x29\xbc\x21\xd2\xc2\x35\xcb\xbf", - .assoc = "\x7b\x01\x5d\x08\x29\x12\xec\x24" - "\x49\xf3\x4d\xf7\xc0\xfe\xfb\x41" - "\x3c\xf8\x79\x6c\xb6\x90\xd4\x0d" - "\xee\xde\x23\x60\xf2\xe5\x08\xcc" - "\x97", - .alen = 33, - .ptext = "", - .plen = 0, - .ctext = "\x28\x64\x78\x51\x55\xd8\x56\x4a" - "\x58\x3e\xf7\xbe\xee\x21\xfe\x94", - .clen = 16, - }, { - .key = "\x99\x93\x7a\xd8\xb1\x50\x40\x31" - "\x8a\x60\xa6\x3f\xbd\x90\x5d\x44", - .klen = 16, - .iv = "\xa8\x5c\x09\x40\xf5\xef\xea\xb8" - "\xaa\x96\xd3\x64\xbc\x59\x8d\xc6", - .assoc = "\xb8\x26\x97\xa8\x39\x8e\x94\x3f" - "\xca\xcd\xff\x88\xba\x22\xbe\x47" - "\x67\xba\x85\xf1\xbb\x30\x56\x26" - "\xaf\x0b\x02\x38\xcc\x44\xa7\xa3" - "\xa6\xbf\x31\x93\x60\xcd\xda\x63" - "\x2c\xb1\xaa\x19\xc8\x19\xf8\xeb" - "\x03\xa1\xe8\xbe\x37\x54\xec\xa2" - "\xcd\x2c\x45\x58\xbd\x8e\x80", - .alen = 63, - .ptext = "", - .plen = 0, - .ctext = "\xb3\xa6\x00\x4e\x09\x20\xac\x21" - "\x77\x72\x69\x76\x2d\x36\xe5\xc8", - .clen = 16, - }, { - .key = "\xd6\xb8\xb4\x77\xc1\xcb\xe9\x4d" - "\x0a\x3a\x58\xd1\xb7\xb4\x1f\x4a", - .klen = 16, - .iv = "\xe5\x81\x42\xdf\x05\x6a\x93\xd4" - "\x2b\x70\x85\xf5\xb6\x7d\x50\xcc", - .assoc = "\xf4\x4a\xd1\x47\x49\x09\x3d\x5b" - "\x4b\xa7\xb1\x19\xb4\x46\x81\x4d" - "\x91\x7c\x91\x75\xc0\xd0\xd8\x40" - "\x71\x39\xe1\x10\xa6\xa3\x46\x7a" - "\xb4\x6b\x35\xc2\xc1\xdf\xed\x60" - "\x46\xc1\x3e\x7f\x8c\xc2\x0e\x7a" - "\x30\x08\xd0\x5f\xa0\xaa\x0c\x6d" - "\x9c\x2f\xdb\x97\xb8\x15\x69\x01", - .alen = 64, - .ptext = "", - .plen = 0, - .ctext = "\x65\x33\x7b\xa1\x63\xf4\x20\xdd" - "\xe4\xb9\x4a\xaa\x9a\x21\xaa\x14", - .clen = 16, - }, { - .key = "\x12\xdd\xee\x17\xd1\x47\x92\x69" - "\x8b\x14\x0a\x62\xb1\xd9\xe2\x50", - .klen = 16, - .iv = "\x22\xa6\x7c\x7f\x15\xe6\x3c\xf0" - "\xac\x4b\x37\x86\xb0\xa2\x13\xd2", - .assoc = "\x31", - .alen = 1, - .ptext = "\x40", - .plen = 1, - .ctext = "\x1d\x47\x17\x34\x86\xf5\x54\x1a" - "\x6d\x28\xb8\x5d\x6c\xcf\xa0\xb9" - "\xbf", - .clen = 17, - }, { - .key = "\x4f\x01\x27\xb6\xe1\xc3\x3a\x85" - "\x0c\xee\xbc\xf4\xab\xfd\xa5\x57", - .klen = 16, - .iv = "\x5e\xcb\xb6\x1e\x25\x62\xe4\x0c" - "\x2d\x25\xe9\x18\xaa\xc6\xd5\xd8", - .assoc = "\x6d\x94\x44\x86\x69\x00\x8f\x93" - "\x4d\x5b\x15\x3c\xa8\x8f\x06\x5a" - "\xe6\x01\xa8\x7e\xca\x10\xdc\x73" - "\xf4\x94\x9f\xc1\x5a\x61\x85", - .alen = 31, - .ptext = "\x7c\x5d\xd3\xee\xad\x9f\x39\x1a" - "\x6d\x92\x42\x61\xa7\x58\x37\xdb" - "\xb0\xb2\x2b\x9f\x0b\xb8\xbd\x7a" - "\x24\xa0\xd6\xb7\x11\x79\x6c", - .plen = 31, - .ctext = "\x78\x90\x52\xae\x0f\xf7\x2e\xef" - "\x63\x09\x08\x58\xb5\x56\xbd\x72" - "\x6e\x42\xcf\x27\x04\x7c\xdb\x92" - "\x18\xe9\xa4\x33\x90\xba\x62\xb5" - "\x70\xd3\x88\x9b\x4f\x05\xa7\x51" - "\x85\x87\x17\x09\x42\xed\x4e", - .clen = 47, - }, { - .key = "\x8b\x26\x61\x55\xf1\x3e\xe3\xa1" - "\x8d\xc8\x6e\x85\xa5\x21\x67\x5d", - .klen = 16, - .iv = "\x9b\xef\xf0\xbd\x35\xdd\x8d\x28" - "\xad\xff\x9b\xa9\xa4\xeb\x98\xdf", - .assoc = "\xaa\xb8\x7e\x25\x79\x7c\x37\xaf" - "\xce\x36\xc7\xce\xa2\xb4\xc9\x60" - "\x10\xc3\xb3\x02\xcf\xb0\x5e\x8d" - "\xb5\xc2\x7e\x9a\x35\xc0\x24\xfd", - .alen = 32, - .ptext = "\xb9\x82\x0c\x8d\xbd\x1b\xe2\x36" - "\xee\x6c\xf4\xf2\xa1\x7d\xf9\xe2" - "\xdb\x74\x36\x23\x11\x58\x3f\x93" - "\xe5\xcd\xb5\x90\xeb\xd8\x0c\xb3", - .plen = 32, - .ctext = "\x1d\x2c\x57\xe0\x50\x38\x3d\x41" - "\x2e\x71\xc8\x3b\x92\x43\x58\xaf" - "\x5a\xfb\xad\x8f\xd9\xd5\x8a\x5e" - "\xdb\xf3\xcd\x3a\x2b\xe1\x2c\x1a" - "\xb0\xed\xe3\x0c\x6e\xf9\xf2\xd6" - "\x90\xe6\xb1\x0e\xa5\x8a\xac\xb7", - .clen = 48, - }, { - .key = "\xc8\x4b\x9b\xf5\x01\xba\x8c\xbd" - "\x0e\xa3\x21\x16\x9f\x46\x2a\x63", - .klen = 16, - .iv = "\xd7\x14\x29\x5d\x45\x59\x36\x44" - "\x2e\xd9\x4d\x3b\x9e\x0f\x5b\xe5", - .assoc = "\xe6\xdd\xb8\xc4\x89\xf8\xe0\xca" - "\x4f\x10\x7a\x5f\x9c\xd8\x8b\x66" - "\x3b\x86\xbf\x86\xd4\x50\xe0\xa7" - "\x76\xef\x5c\x72\x0f\x1f\xc3\xd4" - "\xee", - .alen = 33, - .ptext = "\xf5\xa6\x46\x2c\xce\x97\x8a\x51" - "\x6f\x46\xa6\x83\x9b\xa1\xbc\xe8" - "\x05\x36\x42\xa7\x16\xf8\xc1\xad" - "\xa7\xfb\x94\x68\xc5\x37\xab\x8a" - "\x72", - .plen = 33, - .ctext = "\x59\x10\x84\x1c\x83\x4c\x8b\xfc" - "\xfd\x2e\x4b\x46\x84\xff\x78\x4e" - "\x50\xda\x5c\xb9\x61\x1d\xf5\xb9" - "\xfe\xbb\x7f\xae\x8c\xc1\x24\xbd" - "\x8c\x6f\x1f\x9b\xce\xc6\xc1\x37" - "\x08\x06\x5a\xe5\x96\x10\x95\xc2" - "\x5e", - .clen = 49, - }, { - .key = "\x05\x70\xd5\x94\x12\x36\x35\xd8" - "\x8f\x7d\xd3\xa8\x99\x6a\xed\x69", - .klen = 16, - .iv = "\x14\x39\x63\xfc\x56\xd5\xdf\x5f" - "\xaf\xb3\xff\xcc\x98\x33\x1d\xeb", - .assoc = "\x23\x02\xf1\x64\x9a\x73\x89\xe6" - "\xd0\xea\x2c\xf1\x96\xfc\x4e\x6d" - "\x65\x48\xcb\x0a\xda\xf0\x62\xc0" - "\x38\x1d\x3b\x4a\xe9\x7e\x62\xaa" - "\xfd\xc9\x4a\xa9\xa9\x39\x4b\x54" - "\xc8\x0e\x24\x7f\x5e\x10\x7a\x45" - "\x10\x0b\x56\x85\xad\x54\xaa\x66" - "\xa8\x43\xcd\xd4\x9b\xb7\xfa", - .alen = 63, - .ptext = "\x32\xcb\x80\xcc\xde\x12\x33\x6d" - "\xf0\x20\x58\x15\x95\xc6\x7f\xee" - "\x2f\xf9\x4e\x2c\x1b\x98\x43\xc7" - "\x68\x28\x73\x40\x9f\x96\x4a\x60" - "\x80\xf4\x4b\xf4\xc1\x3d\xd0\x93" - "\xcf\x12\xc9\x59\x8f\x7a\x7f\xa8" - "\x1b\xa5\x50\xed\x87\xa9\x72\x59" - "\x9c\x44\xb2\xa4\x99\x98\x34", - .plen = 63, - .ctext = "\x9a\x12\xbc\xdf\x72\xa8\x56\x22" - "\x49\x2d\x07\x92\xfc\x3d\x6d\x5f" - "\xef\x36\x19\xae\x91\xfa\xd6\x63" - "\x46\xea\x8a\x39\x14\x21\xa6\x37" - "\x18\xfc\x97\x3e\x16\xa5\x4d\x39" - "\x45\x2e\x69\xcc\x9c\x5f\xdf\x6d" - "\x5e\xa2\xbf\xac\x83\x32\x72\x52" - "\x58\x58\x23\x40\xfd\xa5\xc2\xe6" - "\xe9\x5a\x50\x98\x00\x58\xc9\x86" - "\x4f\x20\x37\xdb\x7b\x22\xa3", - .clen = 79, - }, { - .key = "\x41\x94\x0e\x33\x22\xb1\xdd\xf4" - "\x10\x57\x85\x39\x93\x8f\xaf\x70", - .klen = 16, - .iv = "\x50\x5d\x9d\x9b\x66\x50\x88\x7b" - "\x30\x8e\xb1\x5e\x92\x58\xe0\xf1", - .assoc = "\x5f\x27\x2b\x03\xaa\xef\x32\x02" - "\x50\xc4\xde\x82\x90\x21\x11\x73" - "\x8f\x0a\xd6\x8f\xdf\x90\xe4\xda" - "\xf9\x4a\x1a\x23\xc3\xdd\x02\x81" - "\x0b\x76\x4f\xd7\x0a\x4b\x5e\x51" - "\xe3\x1d\xb9\xe5\x21\xb9\x8f\xd4" - "\x3d\x72\x3e\x26\x16\xa9\xca\x32" - "\x77\x47\x63\x14\x95\x3d\xe4\x34", - .alen = 64, - .ptext = "\x6e\xf0\xba\x6b\xee\x8e\xdc\x89" - "\x71\xfb\x0a\xa6\x8f\xea\x41\xf4" - "\x5a\xbb\x59\xb0\x20\x38\xc5\xe0" - "\x29\x56\x52\x19\x79\xf5\xe9\x37" - "\x8f\xa1\x50\x23\x22\x4f\xe3\x91" - "\xe9\x21\x5e\xbf\x52\x23\x95\x37" - "\x48\x0c\x38\x8f\xf0\xff\x92\x24" - "\x6b\x47\x49\xe3\x94\x1f\x1e\x01", - .plen = 64, - .ctext = "\xe6\xeb\x92\x5a\x5b\xf0\x2d\xbb" - "\x23\xec\x35\xe3\xae\xc9\xfb\x0b" - "\x90\x14\x46\xeb\xa8\x8d\xb0\x9b" - "\x39\xda\x8b\x48\xec\xb2\x00\x4e" - "\x80\x6f\x46\x4f\x9b\x1e\xbb\x35" - "\xea\x5a\xbc\xa2\x36\xa5\x89\x45" - "\xc2\xd6\xd7\x15\x0b\xf6\x6c\x56" - "\xec\x99\x7d\x61\xb3\x15\x93\xed" - "\x83\x1e\xd9\x48\x84\x0b\x37\xfe" - "\x95\x74\x44\xd5\x54\xa6\x27\x06", - .clen = 80, - }, { - .key = "\x7e\xb9\x48\xd3\x32\x2d\x86\x10" - "\x91\x31\x37\xcb\x8d\xb3\x72\x76", - .klen = 16, - .iv = "\x8d\x82\xd6\x3b\x76\xcc\x30\x97" - "\xb1\x68\x63\xef\x8c\x7c\xa3\xf7", - .assoc = "\x9c\x4b\x65\xa2\xba\x6b\xdb\x1e" - "\xd1\x9e\x90\x13\x8a\x45\xd3\x79" - "\xba\xcd\xe2\x13\xe4\x30\x66\xf4" - "\xba\x78\xf9\xfb\x9d\x3c\xa1\x58" - "\x1a\x22\x53\x05\x6b\x5c\x71\x4f" - "\xfd\x2d\x4d\x4c\xe5\x62\xa5\x63" - "\x6a\xda\x26\xc8\x7f\xff\xea\xfd" - "\x46\x4a\xfa\x53\x8f\xc4\xcd\x68" - "\x58", - .alen = 65, - .ptext = "\xab\x14\xf3\x0a\xfe\x0a\x85\xa5" - "\xf2\xd5\xbc\x38\x89\x0e\x04\xfb" - "\x84\x7d\x65\x34\x25\xd8\x47\xfa" - "\xeb\x83\x31\xf1\x54\x54\x89\x0d" - "\x9d\x4d\x54\x51\x84\x61\xf6\x8e" - "\x03\x31\xf2\x25\x16\xcc\xaa\xc6" - "\x75\x73\x20\x30\x59\x54\xb2\xf0" - "\x3a\x4b\xe0\x23\x8e\xa6\x08\x35" - "\x8a\xdf\x27\xa0\xe4\x60\x99\xae" - "\x8e\x43\xd9\x39\x7b\x10\x40\x67" - "\x5c\x7e\xc9\x70\x63\x34\xca\x59" - "\xfe\x86\xbc\xb7\x9c\x39\xf3\x6d" - "\x6a\x41\x64\x6f\x16\x7f\x65\x7e" - "\x89\x84\x68\xeb\xb0\x51\xbe\x55" - "\x33\x16\x59\x6c\x3b\xef\x88\xad" - "\x2f\xab\xbc\x25\x76\x87\x41\x2f" - "\x36", - .plen = 129, - .ctext = "\x89\x24\x27\x86\xdc\xd7\x6b\xd9" - "\xd1\xcd\xdc\x16\xdd\x2c\xc1\xfb" - "\x52\xb5\xb3\xab\x50\x99\x3f\xa0" - "\x38\xa4\x74\xa5\x04\x15\x63\x05" - "\x8f\x54\x81\x06\x5a\x6b\xa4\x63" - "\x6d\xa7\x21\xcb\xff\x42\x30\x8e" - "\x3b\xd1\xca\x3f\x4b\x1a\xb8\xc3" - "\x42\x01\xe6\xbc\x75\x15\x87\xee" - "\xc9\x8e\x65\x01\xd9\xd8\xb5\x9f" - "\x48\x86\xa6\x5f\x2c\xc7\xb5\xb0" - "\xed\x5d\x14\x7c\x3f\x40\xb1\x0b" - "\x72\xef\x94\x8d\x7a\x85\x56\xe5" - "\x56\x08\x15\x56\xba\xaf\xbd\xf0" - "\x20\xef\xa0\xf6\xa9\xad\xa2\xc9" - "\x1c\x3b\x28\x51\x7e\x77\xb2\x18" - "\x4f\x61\x64\x37\x22\x36\x6d\x78" - "\xed\xed\x35\xe8\x83\xa5\xec\x25" - "\x6b\xff\x5f\x1a\x09\x96\x3d\xdc" - "\x20", - .clen = 145, - }, { - .key = "\xba\xde\x82\x72\x42\xa9\x2f\x2c" - "\x12\x0b\xe9\x5c\x87\xd7\x35\x7c", - .klen = 16, - .iv = "\xc9\xa7\x10\xda\x86\x48\xd9\xb3" - "\x32\x42\x15\x80\x85\xa1\x65\xfe", - .assoc = "\xd8\x70\x9f\x42\xca\xe6\x83\x3a" - "\x52\x79\x42\xa5\x84\x6a\x96\x7f" - "\xe4\x8f\xed\x97\xe9\xd0\xe8\x0d" - "\x7c\xa6\xd8\xd4\x77\x9b\x40\x2e" - "\x28\xce\x57\x34\xcd\x6e\x84\x4c" - "\x17\x3c\xe1\xb2\xa8\x0b\xbb\xf1" - "\x96\x41\x0d\x69\xe8\x54\x0a\xc8" - "\x15\x4e\x91\x92\x89\x4b\xb7\x9b" - "\x21\xf7\x42\x89\xac\x12\x2a\x54" - "\x69\xee\x18\xc7\x8d\xed\xe8\xfd" - "\xbb\x04\x28\xe6\x8a\x3c\x98\xc1" - "\x04\x2d\xa9\xa1\x24\x83\xff\xe9" - "\x55\x7a\xf0\xd1\xf6\x63\x05\xe1" - "\xd9\x1e\x75\x72\xc1\x9f\xae\x32" - "\xe1\x6b\xcd\x9e\x61\x19\x23\x86" - "\xd9\xd2\xaf\x8e\xd5\xd3\xa8\xa9" - "\x51", - .alen = 129, - .ptext = "\xe8\x39\x2d\xaa\x0e\x85\x2d\xc1" - "\x72\xaf\x6e\xc9\x82\x33\xc7\x01" - "\xaf\x40\x70\xb8\x2a\x78\xc9\x14" - "\xac\xb1\x10\xca\x2e\xb3\x28\xe4" - "\xac\xfa\x58\x7f\xe5\x73\x09\x8c" - "\x1d\x40\x87\x8c\xd9\x75\xc0\x55" - "\xa2\xda\x07\xd1\xc2\xa9\xd1\xbb" - "\x09\x4f\x77\x62\x88\x2d\xf2\x68" - "\x54", - .plen = 65, - .ctext = "\x36\x78\xb9\x22\xde\x62\x35\x55" - "\x1a\x7a\xf5\x45\xbc\xd7\x15\x82" - "\x01\xe9\x5a\x07\xea\x46\xaf\x91" - "\xcb\x73\xa5\xee\xe1\xb4\xbf\xc2" - "\xdb\xd2\x9d\x59\xde\xfc\x83\x00" - "\xf5\x46\xac\x97\xd5\x57\xa9\xb9" - "\x1f\x8c\xe8\xca\x68\x8b\x91\x0c" - "\x01\xbe\x0a\xaf\x7c\xf6\x67\xa4" - "\xbf\xbc\x88\x3f\x5d\xd1\xf9\x19" - "\x0f\x9d\xb2\xaf\xb9\x6e\x17\xdf" - "\xa2", - .clen = 81, - }, { - .key = "\xf7\x02\xbb\x11\x52\x24\xd8\x48" - "\x93\xe6\x9b\xee\x81\xfc\xf7\x82", - .klen = 16, - .iv = "\x06\xcc\x4a\x79\x96\xc3\x82\xcf" - "\xb3\x1c\xc7\x12\x7f\xc5\x28\x04", - .assoc = "\x15\x95\xd8\xe1\xda\x62\x2c\x56" - "\xd3\x53\xf4\x36\x7e\x8e\x59\x85" - "\x0e\x51\xf9\x1c\xee\x70\x6a\x27" - "\x3d\xd3\xb7\xac\x51\xfa\xdf\x05", - .alen = 32, - .ptext = "\x24\x5e\x67\x49\x1e\x01\xd6\xdd" - "\xf3\x89\x20\x5b\x7c\x57\x89\x07" - "\xd9\x02\x7c\x3d\x2f\x18\x4b\x2d" - "\x6e\xde\xee\xa2\x08\x12\xc7\xba", - .plen = 32, - .ctext = "\x08\x1b\x95\x0e\x41\x95\x02\x4b" - "\x9c\xbb\xa8\xd0\x7c\xd3\x44\x6e" - "\x89\x14\x33\x70\x0a\xbc\xea\x39" - "\x88\xaa\x2b\xd5\x73\x11\x55\xf5" - "\x33\x33\x9c\xd7\x42\x34\x49\x8e" - "\x2f\x03\x30\x05\x47\xaf\x34", - .clen = 47, - }, { - .key = "\x33\x27\xf5\xb1\x62\xa0\x80\x63" - "\x14\xc0\x4d\x7f\x7b\x20\xba\x89", - .klen = 16, - .iv = "\x42\xf0\x84\x19\xa6\x3f\x2b\xea" - "\x34\xf6\x79\xa3\x79\xe9\xeb\x0a", - .assoc = "\x51\xb9\x12\x80\xea\xde\xd5\x71" - "\x54\x2d\xa6\xc8\x78\xb2\x1b\x8c" - "\x39\x14\x05\xa0\xf3\x10\xec\x41" - "\xff\x01\x95\x84\x2b\x59\x7f\xdb", - .alen = 32, - .ptext = "\x61\x83\xa0\xe8\x2e\x7d\x7f\xf8" - "\x74\x63\xd2\xec\x76\x7c\x4c\x0d" - "\x03\xc4\x88\xc1\x35\xb8\xcd\x47" - "\x2f\x0c\xcd\x7a\xe2\x71\x66\x91", - .plen = 32, - .ctext = "\x97\xca\xf4\xe0\x8d\x89\xbf\x68" - "\x0c\x60\xb9\x27\xdf\xaa\x41\xc6" - "\x25\xd8\xf7\x1f\x10\x15\x48\x61" - "\x4c\x95\x00\xdf\x51\x9b\x7f\xe6" - "\x24\x40\x9e\xbe\x3b\xeb\x1b\x98" - "\xb9\x9c\xe5\xef\xf2\x05", - .clen = 46, - }, { - .key = "\x70\x4c\x2f\x50\x72\x1c\x29\x7f" - "\x95\x9a\xff\x10\x75\x45\x7d\x8f", - .klen = 16, - .iv = "\x7f\x15\xbd\xb8\xb6\xba\xd3\x06" - "\xb5\xd1\x2b\x35\x73\x0e\xad\x10", - .assoc = "\x8e\xde\x4c\x20\xfa\x59\x7e\x8d" - "\xd5\x07\x58\x59\x72\xd7\xde\x92" - "\x63\xd6\x10\x24\xf8\xb0\x6e\x5a" - "\xc0\x2e\x74\x5d\x06\xb8\x1e\xb2", - .alen = 32, - .ptext = "\x9d\xa7\xda\x88\x3e\xf8\x28\x14" - "\xf5\x3e\x85\x7d\x70\xa0\x0f\x13" - "\x2e\x86\x93\x45\x3a\x58\x4f\x61" - "\xf0\x3a\xac\x53\xbc\xd0\x06\x68", - .plen = 32, - .ctext = "\x63\x4c\x2a\x8e\xb4\x6b\x63\x0d" - "\xb5\xec\x9b\x4e\x12\x23\xa3\xcf" - "\x1a\x5a\x70\x15\x5a\x10\x40\x51" - "\xca\x47\x4c\x9d\xc9\x97\xf4\x77" - "\xdb\xc8\x10\x2d\xdc\x65\x20\x3f", - .clen = 40, - }, { - .key = "\xac\x70\x69\xef\x82\x97\xd2\x9b" - "\x15\x74\xb1\xa2\x6f\x69\x3f\x95", - .klen = 16, - .iv = "\xbb\x3a\xf7\x57\xc6\x36\x7c\x22" - "\x36\xab\xde\xc6\x6d\x32\x70\x17", - .assoc = "\xcb\x03\x85\xbf\x0a\xd5\x26\xa9" - "\x56\xe1\x0a\xeb\x6c\xfb\xa1\x98" - "\x8d\x98\x1c\xa8\xfe\x50\xf0\x74" - "\x81\x5c\x53\x35\xe0\x17\xbd\x88", - .alen = 32, - .ptext = "\xda\xcc\x14\x27\x4e\x74\xd1\x30" - "\x76\x18\x37\x0f\x6a\xc4\xd1\x1a" - "\x58\x49\x9f\xc9\x3f\xf8\xd1\x7a" - "\xb2\x67\x8b\x2b\x96\x2f\xa5\x3e", - .plen = 32, - .ctext = "\xf1\x62\x44\xc7\x5f\x19\xca\x43" - "\x47\x2c\xaf\x68\x82\xbd\x51\xef" - "\x3d\x65\xd8\x45\x2d\x06\x07\x78" - "\x08\x2e\xb3\x23\xcd\x81\x12\x55" - "\x1a", - .clen = 33, - }, { - .key = "\xe9\x95\xa2\x8f\x93\x13\x7b\xb7" - "\x96\x4e\x63\x33\x69\x8d\x02\x9b" - "\x23\xf9\x22\xeb\x80\xa0\xb1\x81" - "\xe2\x73\xc3\x21\x4d\x47\x8d\xf4", - .klen = 32, - .iv = "\xf8\x5e\x31\xf7\xd7\xb2\x25\x3e" - "\xb7\x85\x90\x58\x67\x57\x33\x1d", - .assoc = "", - .alen = 0, - .ptext = "", - .plen = 0, - .ctext = "\xdf\x2f\x83\xc0\x45\x4a\x2c\xcf" - "\xb9\xd2\x41\xf6\x80\xa1\x52\x70", - .clen = 16, - }, { - .key = "\x25\xba\xdc\x2e\xa3\x8f\x24\xd3" - "\x17\x29\x15\xc5\x63\xb2\xc5\xa1" - "\x4d\xbc\x2d\x6f\x85\x40\x33\x9a" - "\xa3\xa0\xa1\xfa\x27\xa6\x2c\xca", - .klen = 32, - .iv = "\x34\x83\x6a\x96\xe7\x2d\xce\x5a" - "\x38\x5f\x42\xe9\x61\x7b\xf5\x23", - .assoc = "", - .alen = 0, - .ptext = "\x53", - .plen = 1, - .ctext = "\x01\xd8\x55\x3c\xc0\x5a\x4b\xc7" - "\x01\xf4\x08\xe3\x0d\xf7\xf0\x78" - "\x53", - .clen = 17, - }, { - .key = "\x62\xdf\x16\xcd\xb3\x0a\xcc\xef" - "\x98\x03\xc7\x56\x5d\xd6\x87\xa8" - "\x77\x7e\x39\xf3\x8a\xe0\xb5\xb4" - "\x65\xce\x80\xd2\x01\x05\xcb\xa1", - .klen = 32, - .iv = "\x71\xa8\xa4\x35\xf7\xa9\x76\x75" - "\xb8\x39\xf4\x7a\x5b\x9f\xb8\x29", - .assoc = "", - .alen = 0, - .ptext = "\x8f\x3a\xc1\x05\x7f\xe7\xcb\x83" - "\xf9\xa6\x4d\xc3\x58\x31\x19\x2c" - "\xd7\x90\xc2\x56\x4e\xd8\x57\xc7" - "\xf6\xf0\x27\xb4\x25\x4c\x83", - .plen = 31, - .ctext = "\xc2\x4b\x41\x0f\x2d\xb9\x62\x07" - "\xff\x8e\x74\xf8\xa1\xa6\xd5\x37" - "\xa5\x64\x31\x5c\xca\x73\x9b\x43" - "\xe6\x70\x63\x46\x95\xcb\xf7\xb5" - "\x20\x8c\x75\x7a\x2a\x17\x2f\xa9" - "\xb8\x4d\x11\x42\xd1\xf8\xf1", - .clen = 47, - }, { - .key = "\x9e\x03\x4f\x6d\xc3\x86\x75\x0a" - "\x19\xdd\x79\xe8\x57\xfb\x4a\xae" - "\xa2\x40\x45\x77\x90\x80\x37\xce" - "\x26\xfb\x5f\xaa\xdb\x64\x6b\x77", - .klen = 32, - .iv = "\xae\xcc\xde\xd5\x07\x25\x1f\x91" - "\x39\x14\xa6\x0c\x55\xc4\x7b\x30", - .assoc = "", - .alen = 0, - .ptext = "\xcc\x5f\xfb\xa4\x8f\x63\x74\x9f" - "\x7a\x81\xff\x55\x52\x56\xdc\x33" - "\x01\x52\xcd\xdb\x53\x78\xd9\xe1" - "\xb7\x1d\x06\x8d\xff\xab\x22\x98", - .plen = 32, - .ctext = "\xbb\x01\x7c\xd1\x2c\x33\x7b\x37" - "\x0a\xee\xc4\x30\x19\xd7\x3a\x6f" - "\xf8\x2b\x67\xf5\x3b\x84\x87\x2a" - "\xfb\x07\x7a\x82\xb5\xe4\x85\x26" - "\x1e\xa8\xe5\x04\x54\xce\xe5\x5f" - "\xb5\x3f\xc1\xd5\x7f\xbd\xd2\xa6", - .clen = 48, - }, { - .key = "\xdb\x28\x89\x0c\xd3\x01\x1e\x26" - "\x9a\xb7\x2b\x79\x51\x1f\x0d\xb4" - "\xcc\x03\x50\xfc\x95\x20\xb9\xe7" - "\xe8\x29\x3e\x83\xb5\xc3\x0a\x4e", - .klen = 32, - .iv = "\xea\xf1\x18\x74\x17\xa0\xc8\xad" - "\xba\xee\x58\x9d\x4f\xe8\x3d\x36", - .assoc = "", - .alen = 0, - .ptext = "\x08\x84\x34\x44\x9f\xde\x1c\xbb" - "\xfb\x5b\xb1\xe6\x4c\x7a\x9f\x39" - "\x2c\x14\xd9\x5f\x59\x18\x5b\xfb" - "\x79\x4b\xe5\x65\xd9\x0a\xc1\x6f" - "\x2e", - .plen = 33, - .ctext = "\xc2\xf4\x40\x55\xf9\x59\xff\x73" - "\x08\xf5\x98\x92\x0c\x7b\x35\x9a" - "\xa8\xf4\x42\x7e\x6f\x93\xca\x22" - "\x23\x06\x1e\xf8\x89\x22\xf4\x46" - "\x7c\x7c\x67\x75\xab\xe5\x75\xaa" - "\x15\xd7\x83\x19\xfd\x31\x59\x5b" - "\x32", - .clen = 49, - }, { - .key = "\x17\x4d\xc3\xab\xe3\x7d\xc7\x42" - "\x1b\x91\xdd\x0a\x4b\x43\xcf\xba" - "\xf6\xc5\x5c\x80\x9a\xc0\x3b\x01" - "\xa9\x56\x1d\x5b\x8f\x22\xa9\x25", - .klen = 32, - .iv = "\x27\x16\x51\x13\x27\x1c\x71\xc9" - "\x3b\xc8\x0a\x2f\x49\x0c\x00\x3c", - .assoc = "", - .alen = 0, - .ptext = "\x45\xa8\x6e\xe3\xaf\x5a\xc5\xd7" - "\x7c\x35\x63\x77\x46\x9f\x61\x3f" - "\x56\xd7\xe4\xe3\x5e\xb8\xdc\x14" - "\x3a\x79\xc4\x3e\xb3\x69\x61\x46" - "\x3c\xb6\x83\x4e\xb4\x26\xc7\x73" - "\x22\xda\x52\x8b\x7d\x11\x98\xea" - "\x62\xe1\x14\x1e\xdc\xfe\x0f\xad" - "\x20\x76\x5a\xdc\x4e\x71\x13", - .plen = 63, - .ctext = "\xc9\x82\x3b\x4b\x87\x84\xa5\xdb" - "\xa0\x8c\xd3\x3e\x7f\x8d\xe8\x28" - "\x2a\xdc\xfa\x01\x84\x87\x9a\x70" - "\x81\x75\x37\x0a\xd2\x75\xa9\xb6" - "\x21\x72\xee\x7e\x65\x95\xe5\xcc" - "\x01\xb7\x39\xa6\x51\x15\xca\xff" - "\x61\xdc\x97\x38\xcc\xf4\xca\xc7" - "\x83\x9b\x05\x11\x72\x60\xf0\xb4" - "\x7e\x06\xab\x0a\xc0\xbb\x59\x23" - "\xaa\x2d\xfc\x4e\x35\x05\x59", - .clen = 79, - }, { - .key = "\x54\x71\xfd\x4b\xf3\xf9\x6f\x5e" - "\x9c\x6c\x8f\x9c\x45\x68\x92\xc1" - "\x21\x87\x67\x04\x9f\x60\xbd\x1b" - "\x6a\x84\xfc\x34\x6a\x81\x48\xfb", - .klen = 32, - .iv = "\x63\x3b\x8b\xb3\x37\x98\x1a\xe5" - "\xbc\xa2\xbc\xc0\x43\x31\xc2\x42", - .assoc = "", - .alen = 0, - .ptext = "\x81\xcd\xa8\x82\xbf\xd6\x6e\xf3" - "\xfd\x0f\x15\x09\x40\xc3\x24\x45" - "\x81\x99\xf0\x67\x63\x58\x5e\x2e" - "\xfb\xa6\xa3\x16\x8d\xc8\x00\x1c" - "\x4b\x62\x87\x7c\x15\x38\xda\x70" - "\x3d\xea\xe7\xf2\x40\xba\xae\x79" - "\x8f\x48\xfc\xbf\x45\x53\x2e\x78" - "\xef\x79\xf0\x1b\x49\xf7\xfd\x9c", - .plen = 64, - .ctext = "\x11\x7c\x7d\xef\xce\x29\x95\xec" - "\x7e\x9f\x42\xa6\x26\x07\xa1\x75" - "\x2f\x4e\x09\x9a\xf6\x6b\xc2\xfa" - "\x0d\xd0\x17\xdc\x25\x1e\x9b\xdc" - "\x5f\x8c\x1c\x60\x15\x4f\x9b\x20" - "\x7b\xff\xcd\x82\x60\x84\xf4\xa5" - "\x20\x9a\x05\x19\x5b\x02\x0a\x72" - "\x43\x11\x26\x58\xcf\xc5\x41\xcf" - "\x13\xcc\xde\x32\x92\xfa\x86\xf2" - "\xaf\x16\xe8\x8f\xca\xb6\xfd\x54", - .clen = 80, - }, { - .key = "\x90\x96\x36\xea\x03\x74\x18\x7a" - "\x1d\x46\x42\x2d\x3f\x8c\x54\xc7" - "\x4b\x4a\x73\x89\xa4\x00\x3f\x34" - "\x2c\xb1\xdb\x0c\x44\xe0\xe8\xd2", - .klen = 32, - .iv = "\xa0\x5f\xc5\x52\x47\x13\xc2\x01" - "\x3d\x7c\x6e\x52\x3d\x55\x85\x48", - .assoc = "\xaf", - .alen = 1, - .ptext = "", - .plen = 0, - .ctext = "\x9b\xc5\x3b\x20\x0a\x88\x56\xbe" - "\x69\xdf\xc4\xc4\x02\x46\x3a\xf0", - .clen = 16, - }, { - .key = "\xcd\xbb\x70\x89\x13\xf0\xc1\x95" - "\x9e\x20\xf4\xbf\x39\xb1\x17\xcd" - "\x76\x0c\x7f\x0d\xa9\xa0\xc1\x4e" - "\xed\xdf\xb9\xe4\x1e\x3f\x87\xa8", - .klen = 32, - .iv = "\xdc\x84\xfe\xf1\x58\x8f\x6b\x1c" - "\xbe\x57\x20\xe3\x37\x7a\x48\x4f", - .assoc = "\xeb\x4d\x8d\x59\x9c\x2e\x15\xa3" - "\xde\x8d\x4d\x07\x36\x43\x78\xd0" - "\x0b\x6d\x84\x4f\x2c\xf0\x82\x5b" - "\x4e\xf6\x29\xd1\x8b\x6f\x56", - .alen = 31, - .ptext = "", - .plen = 0, - .ctext = "\xe0\x6d\xa1\x07\x98\x2f\x40\x2d" - "\x2e\x9a\xd6\x61\x43\xc0\x74\x69", - .clen = 16, - }, { - .key = "\x0a\xe0\xaa\x29\x24\x6c\x6a\xb1" - "\x1f\xfa\xa6\x50\x33\xd5\xda\xd3" - "\xa0\xce\x8a\x91\xae\x40\x43\x68" - "\xae\x0d\x98\xbd\xf8\x9e\x26\x7f", - .klen = 32, - .iv = "\x19\xa9\x38\x91\x68\x0b\x14\x38" - "\x3f\x31\xd2\x74\x31\x9e\x0a\x55", - .assoc = "\x28\x72\xc7\xf8\xac\xaa\xbe\xbf" - "\x5f\x67\xff\x99\x30\x67\x3b\xd6" - "\x35\x2f\x90\xd3\x31\x90\x04\x74" - "\x0f\x23\x08\xa9\x65\xce\xf6\xea", - .alen = 32, - .ptext = "", - .plen = 0, - .ctext = "\xb9\x57\x13\x3e\x82\x31\x61\x65" - "\x0d\x7f\x6c\x96\x93\x5c\x50\xe2", - .clen = 16, - }, { - .key = "\x46\x04\xe3\xc8\x34\xe7\x12\xcd" - "\xa0\xd4\x58\xe2\x2d\xf9\x9c\xda" - "\xca\x91\x96\x15\xb4\xe0\xc5\x81" - "\x70\x3a\x77\x95\xd2\xfd\xc5\x55", - .klen = 32, - .iv = "\x55\xcd\x72\x30\x78\x86\xbd\x54" - "\xc0\x0b\x84\x06\x2b\xc2\xcd\x5b", - .assoc = "\x64\x97\x00\x98\xbc\x25\x67\xdb" - "\xe0\x41\xb1\x2a\x2a\x8c\xfe\xdd" - "\x5f\xf2\x9c\x58\x36\x30\x86\x8e" - "\xd1\x51\xe6\x81\x3f\x2d\x95\xc1" - "\x01", - .alen = 33, - .ptext = "", - .plen = 0, - .ctext = "\x81\x96\x34\xde\xbb\x36\xdd\x3e" - "\x4e\x5e\xcb\x44\x21\xb8\x3f\xf1", - .clen = 16, - }, { - .key = "\x83\x29\x1d\x67\x44\x63\xbb\xe9" - "\x20\xaf\x0a\x73\x27\x1e\x5f\xe0" - "\xf5\x53\xa1\x9a\xb9\x80\x47\x9b" - "\x31\x68\x56\x6e\xac\x5c\x65\x2c", - .klen = 32, - .iv = "\x92\xf2\xac\xcf\x88\x02\x65\x70" - "\x41\xe5\x36\x97\x25\xe7\x90\x61", - .assoc = "\xa1\xbb\x3a\x37\xcc\xa1\x10\xf7" - "\x61\x1c\x63\xbc\x24\xb0\xc0\xe3" - "\x8a\xb4\xa7\xdc\x3b\xd0\x08\xa8" - "\x92\x7f\xc5\x5a\x19\x8c\x34\x97" - "\x0f\x95\x9b\x18\xe4\x8d\xb4\x24" - "\xb9\x33\x28\x18\xe1\x9d\x14\xe0" - "\x64\xb2\x89\x7d\x78\xa8\x05\x7e" - "\x07\x8c\xfc\x88\x2d\xb8\x53", - .alen = 63, - .ptext = "", - .plen = 0, - .ctext = "\x2e\x99\xb6\x79\x57\x56\x80\x36" - "\x8e\xc4\x1c\x12\x7d\x71\x36\x0c", - .clen = 16, - }, { - .key = "\xbf\x4e\x57\x07\x54\xdf\x64\x05" - "\xa1\x89\xbc\x04\x21\x42\x22\xe6" - "\x1f\x15\xad\x1e\xbe\x20\xc9\xb4" - "\xf3\x95\x35\x46\x86\xbb\x04\x03", - .klen = 32, - .iv = "\xce\x17\xe5\x6f\x98\x7e\x0e\x8c" - "\xc2\xbf\xe8\x29\x1f\x0b\x52\x68", - .assoc = "\xdd\xe0\x74\xd6\xdc\x1d\xb8\x13" - "\xe2\xf6\x15\x4d\x1e\xd4\x83\xe9" - "\xb4\x76\xb3\x60\x40\x70\x8a\xc1" - "\x53\xac\xa4\x32\xf3\xeb\xd3\x6e" - "\x1e\x42\xa0\x46\x45\x9f\xc7\x22" - "\xd3\x43\xbc\x7e\xa5\x47\x2a\x6f" - "\x91\x19\x70\x1e\xe1\xfe\x25\x49" - "\xd6\x8f\x93\xc7\x28\x3f\x3d\x03", - .alen = 64, - .ptext = "", - .plen = 0, - .ctext = "\x7b\x25\x3d\x47\xd4\xa7\x08\xce" - "\x3b\x89\x40\x36\xba\x6d\x0e\xa2", - .clen = 16, - }, { - .key = "\xfc\x72\x90\xa6\x64\x5a\x0d\x21" - "\x22\x63\x6e\x96\x1b\x67\xe4\xec" - "\x49\xd7\xb9\xa2\xc3\xc0\x4b\xce" - "\xb4\xc3\x14\x1e\x61\x1a\xa3\xd9", - .klen = 32, - .iv = "\x0b\x3c\x1f\x0e\xa8\xf9\xb7\xa7" - "\x42\x9a\x9a\xba\x19\x30\x15\x6e", - .assoc = "\x1a", - .alen = 1, - .ptext = "\x29", - .plen = 1, - .ctext = "\xe6\x09\x6f\x95\x9a\x18\xc8\xf6" - "\x17\x75\x81\x16\xdf\x26\xff\x67" - "\x92", - .clen = 17, - }, { - .key = "\x38\x97\xca\x45\x74\xd6\xb6\x3c" - "\xa3\x3d\x20\x27\x15\x8b\xa7\xf2" - "\x74\x9a\xc4\x27\xc8\x60\xcd\xe8" - "\x75\xf0\xf2\xf7\x3b\x79\x42\xb0", - .klen = 32, - .iv = "\x47\x60\x59\xad\xb8\x75\x60\xc3" - "\xc3\x74\x4c\x4c\x13\x54\xd8\x74", - .assoc = "\x56\x29\xe7\x15\xfc\x14\x0a\x4a" - "\xe4\xaa\x79\x70\x12\x1d\x08\xf6" - "\x09\xfb\xca\x69\x4b\xb0\x8e\xf5" - "\xd6\x07\x62\xe3\xa8\xa9\x12", - .alen = 31, - .ptext = "\x66\xf3\x75\x7d\x40\xb3\xb4\xd1" - "\x04\xe1\xa6\x94\x10\xe6\x39\x77" - "\xd3\xac\x4d\x8a\x8c\x58\x6e\xfb" - "\x06\x13\x9a\xd9\x5e\xc0\xfa", - .plen = 31, - .ctext = "\x82\xc0\x56\xf0\xd7\xc4\xc9\xfd" - "\x3c\xd1\x2a\xd4\x15\x86\x9d\xda" - "\xea\x6c\x6f\xa1\x33\xb0\x7a\x01" - "\x57\xe7\xf3\x7b\x73\xe7\x54\x10" - "\xc6\x91\xe2\xc6\xa0\x69\xe7\xe6" - "\x76\xc3\xf5\x3a\x76\xfd\x4a", - .clen = 47, - }, { - .key = "\x75\xbc\x04\xe5\x84\x52\x5e\x58" - "\x24\x17\xd2\xb9\x0e\xaf\x6a\xf9" - "\x9e\x5c\xd0\xab\xcd\x00\x4f\x01" - "\x37\x1e\xd1\xcf\x15\xd8\xe2\x86", - .klen = 32, - .iv = "\x84\x85\x92\x4d\xc8\xf1\x08\xdf" - "\x44\x4e\xff\xdd\x0d\x78\x9a\x7a", - .assoc = "\x93\x4e\x21\xb4\x0c\x90\xb3\x66" - "\x65\x84\x2b\x01\x0b\x42\xcb\xfc" - "\x33\xbd\xd6\xed\x50\x50\x10\x0e" - "\x97\x35\x41\xbb\x82\x08\xb1\xf2", - .alen = 32, - .ptext = "\xa2\x17\xaf\x1c\x50\x2e\x5d\xed" - "\x85\xbb\x58\x26\x0a\x0b\xfc\x7d" - "\xfe\x6e\x59\x0e\x91\xf8\xf0\x15" - "\xc8\x40\x78\xb1\x38\x1f\x99\xa7", - .plen = 32, - .ctext = "\x01\x47\x8e\x6c\xf6\x64\x89\x3a" - "\x71\xce\xe4\xaa\x45\x70\xe6\x84" - "\x62\x48\x08\x64\x86\x6a\xdf\xec" - "\xb4\xa0\xfb\x34\x03\x0c\x19\xf4" - "\x2b\x7b\x36\x73\xec\x54\xa9\x1e" - "\x30\x85\xdb\xe4\xac\xe9\x2c\xca", - .clen = 48, - }, { - .key = "\xb1\xe1\x3e\x84\x94\xcd\x07\x74" - "\xa5\xf2\x84\x4a\x08\xd4\x2c\xff" - "\xc8\x1e\xdb\x2f\xd2\xa0\xd1\x1b" - "\xf8\x4c\xb0\xa8\xef\x37\x81\x5d", - .klen = 32, - .iv = "\xc0\xaa\xcc\xec\xd8\x6c\xb1\xfb" - "\xc5\x28\xb1\x6e\x07\x9d\x5d\x81", - .assoc = "\xd0\x73\x5a\x54\x1d\x0b\x5b\x82" - "\xe5\x5f\xdd\x93\x05\x66\x8e\x02" - "\x5e\x80\xe1\x71\x55\xf0\x92\x28" - "\x59\x62\x20\x94\x5c\x67\x50\xc8" - "\x58", - .alen = 33, - .ptext = "\xdf\x3c\xe9\xbc\x61\xaa\x06\x09" - "\x06\x95\x0a\xb7\x04\x2f\xbe\x84" - "\x28\x30\x64\x92\x96\x98\x72\x2e" - "\x89\x6e\x57\x8a\x13\x7e\x38\x7e" - "\xdb", - .plen = 33, - .ctext = "\x85\xe0\xf8\x0f\x8e\x49\xe3\x60" - "\xcb\x4a\x54\x94\xcf\xf5\x7e\x34" - "\xe9\xf8\x80\x65\x53\xd0\x72\x70" - "\x4f\x7d\x9d\xd1\x15\x6f\xb9\x2c" - "\xfa\xe8\xdd\xac\x2e\xe1\x3f\x67" - "\x63\x0f\x1a\x59\xb7\x89\xdb\xf4" - "\xc3", - .clen = 49, - }, { - .key = "\xee\x05\x77\x23\xa5\x49\xb0\x90" - "\x26\xcc\x36\xdc\x02\xf8\xef\x05" - "\xf3\xe1\xe7\xb3\xd8\x40\x53\x35" - "\xb9\x79\x8f\x80\xc9\x96\x20\x33", - .klen = 32, - .iv = "\xfd\xce\x06\x8b\xe9\xe8\x5a\x17" - "\x46\x02\x63\x00\x01\xc1\x20\x87", - .assoc = "\x0c\x98\x94\xf3\x2d\x87\x04\x9e" - "\x66\x39\x8f\x24\xff\x8a\x50\x08" - "\x88\x42\xed\xf6\x5a\x90\x14\x42" - "\x1a\x90\xfe\x6c\x36\xc6\xf0\x9f" - "\x66\xa0\xb5\x2d\x2c\xf8\x25\x15" - "\x55\x90\xa2\x7e\x77\x94\x96\x3a" - "\x71\x1c\xf7\x44\xee\xa8\xc3\x42" - "\xe2\xa3\x84\x04\x0b\xe1\xce", - .alen = 63, - .ptext = "\x1b\x61\x23\x5b\x71\x26\xae\x25" - "\x87\x6f\xbc\x49\xfe\x53\x81\x8a" - "\x53\xf2\x70\x17\x9b\x38\xf4\x48" - "\x4b\x9b\x36\x62\xed\xdd\xd8\x54" - "\xea\xcb\xb6\x79\x45\xfc\xaa\x54" - "\x5c\x94\x47\x58\xa7\xff\x9c\x9e" - "\x7c\xb6\xf1\xac\xc8\xfd\x8b\x35" - "\xd5\xa4\x6a\xd4\x09\xc2\x08", - .plen = 63, - .ctext = "\x00\xe5\x5b\x87\x5c\x20\x22\x8a" - "\xda\x1f\xd3\xff\xbb\xb2\xb0\xf8" - "\xef\xe9\xeb\x9e\x7c\x80\xf4\x2b" - "\x59\xc0\x79\xbc\x17\xa0\x15\x01" - "\xf5\x72\xfb\x5a\xe7\xaf\x07\xe3" - "\x1b\x49\x21\x34\x23\x63\x55\x5e" - "\xee\x4f\x34\x17\xfa\xfe\xa5\x0c" - "\xed\x0b\x23\xea\x9b\xda\x57\x2f" - "\xf6\xa9\xae\x0d\x4e\x40\x96\x45" - "\x7f\xfa\xf0\xbf\xc4\x98\x78", - .clen = 79, - }, { - .key = "\x2a\x2a\xb1\xc3\xb5\xc5\x59\xac" - "\xa7\xa6\xe8\x6d\xfc\x1d\xb2\x0b" - "\x1d\xa3\xf3\x38\xdd\xe0\xd5\x4e" - "\x7b\xa7\x6e\x58\xa3\xf5\xbf\x0a", - .klen = 32, - .iv = "\x39\xf3\x3f\x2b\xf9\x64\x03\x33" - "\xc7\xdd\x15\x91\xfb\xe6\xe2\x8d", - .assoc = "\x49\xbc\xce\x92\x3d\x02\xad\xba" - "\xe7\x13\x41\xb6\xf9\xaf\x13\x0f" - "\xb2\x04\xf8\x7a\x5f\x30\x96\x5b" - "\xdc\xbd\xdd\x44\x10\x25\x8f\x75" - "\x75\x4d\xb9\x5b\x8e\x0a\x38\x13" - "\x6f\x9f\x36\xe4\x3a\x3e\xac\xc9" - "\x9d\x83\xde\xe5\x57\xfd\xe3\x0e" - "\xb1\xa7\x1b\x44\x05\x67\xb7\x37", - .alen = 64, - .ptext = "\x58\x85\x5c\xfa\x81\xa1\x57\x40" - "\x08\x4a\x6e\xda\xf8\x78\x44\x90" - "\x7d\xb5\x7b\x9b\xa1\xd8\x76\x62" - "\x0c\xc9\x15\x3b\xc7\x3c\x77\x2b" - "\xf8\x78\xba\xa7\xa6\x0e\xbd\x52" - "\x76\xa3\xdc\xbe\x6b\xa8\xb1\x2d" - "\xa9\x1d\xd8\x4e\x31\x53\xab\x00" - "\xa5\xa7\x01\x13\x04\x49\xf2\x04", - .plen = 64, - .ctext = "\x28\xdd\xb9\x4a\x12\xc7\x0a\xe1" - "\x58\x06\x1a\x9b\x8c\x67\xdf\xeb" - "\x35\x35\x60\x9d\x06\x40\x65\xc1" - "\x93\xe8\xb3\x82\x50\x29\xdd\xb5" - "\x2b\xcb\xde\x18\x78\x6b\x42\xbe" - "\x6d\x24\xd0\xb2\x7d\xd7\x08\x8f" - "\x4a\x18\x98\xad\x8c\xf2\x97\xb4" - "\xf4\x77\xe4\xbf\x41\x3b\xc4\x06" - "\xce\x9e\x34\x81\xf0\x89\x11\x13" - "\x02\x65\xa1\x7c\xdf\x07\x33\x06", - .clen = 80, - }, { - .key = "\x67\x4f\xeb\x62\xc5\x40\x01\xc7" - "\x28\x80\x9a\xfe\xf6\x41\x74\x12" - "\x48\x65\xfe\xbc\xe2\x80\x57\x68" - "\x3c\xd4\x4d\x31\x7d\x54\x5f\xe1", - .klen = 32, - .iv = "\x76\x18\x79\xca\x09\xdf\xac\x4e" - "\x48\xb7\xc7\x23\xf5\x0a\xa5\x93", - .assoc = "\x85\xe1\x08\x32\x4d\x7e\x56\xd5" - "\x68\xed\xf3\x47\xf3\xd3\xd6\x15" - "\xdd\xc7\x04\xfe\x64\xd0\x18\x75" - "\x9d\xeb\xbc\x1d\xea\x84\x2e\x4c" - "\x83\xf9\xbe\x8a\xef\x1c\x4b\x10" - "\x89\xaf\xcb\x4b\xfe\xe7\xc1\x58" - "\xca\xea\xc6\x87\xc0\x53\x03\xd9" - "\x80\xaa\xb2\x83\xff\xee\xa1\x6a" - "\x04", - .alen = 65, - .ptext = "\x94\xaa\x96\x9a\x91\x1d\x00\x5c" - "\x88\x24\x20\x6b\xf2\x9c\x06\x96" - "\xa7\x77\x87\x1f\xa6\x78\xf8\x7b" - "\xcd\xf6\xf4\x13\xa1\x9b\x16\x02" - "\x07\x24\xbf\xd5\x08\x20\xd0\x4f" - "\x90\xb3\x70\x24\x2f\x51\xc7\xbb" - "\xd6\x84\xc0\xef\x9a\xa8\xca\xcc" - "\x74\xab\x97\x53\xfe\xd0\xdb\x37" - "\x37\x6a\x0e\x9f\x3f\xa3\x2a\xe3" - "\x1b\x34\x6d\x51\x72\x2b\x17\xe7" - "\x4d\xaa\x2c\x18\xda\xa3\x33\x89" - "\x2a\x9f\xf4\xd2\xed\x76\x3d\x3f" - "\x3c\x15\x9d\x8e\x4f\x3c\x27\xb0" - "\x42\x3f\x2f\x8a\xd4\xc2\x10\xb2" - "\x27\x7f\xe3\x34\x80\x02\x49\x4b" - "\x07\x68\x22\x2a\x88\x25\x53\xb2" - "\x2f", - .plen = 129, - .ctext = "\x85\x39\x69\x35\xfb\xf9\xb0\xa6" - "\x85\x43\x88\xd0\xd7\x78\x60\x19" - "\x3e\x1f\xb1\xa4\xd6\xc5\x96\xec" - "\xf7\x84\x85\xc7\x27\x0f\x74\x57" - "\x28\x9e\xdd\x90\x3c\x43\x12\xc5" - "\x51\x3d\x39\x8f\xa5\xf4\xe0\x0b" - "\x57\x04\xf1\x6d\xfe\x9b\x84\x27" - "\xe8\xeb\x4d\xda\x02\x0a\xc5\x49" - "\x1a\x55\x5e\x50\x56\x4d\x94\xda" - "\x20\xf8\x12\x54\x50\xb3\x11\xda" - "\xed\x44\x27\x67\xd5\xd1\x8b\x4b" - "\x38\x67\x56\x65\x59\xda\xe6\x97" - "\x81\xae\x2f\x92\x3b\xae\x22\x1c" - "\x91\x59\x38\x18\x00\xe8\xba\x92" - "\x04\x19\x56\xdf\xb0\x82\xeb\x6f" - "\x2e\xdb\x54\x3c\x4b\xbb\x60\x90" - "\x4c\x50\x10\x62\xba\x7a\xb1\x68" - "\x37\xd7\x87\x4e\xe4\x66\x09\x1f" - "\xa5", - .clen = 145, - }, { - .key = "\xa3\x73\x24\x01\xd5\xbc\xaa\xe3" - "\xa9\x5a\x4c\x90\xf0\x65\x37\x18" - "\x72\x28\x0a\x40\xe7\x20\xd9\x82" - "\xfe\x02\x2b\x09\x57\xb3\xfe\xb7", - .klen = 32, - .iv = "\xb3\x3d\xb3\x69\x19\x5b\x54\x6a" - "\xc9\x91\x79\xb4\xef\x2e\x68\x99", - .assoc = "\xc2\x06\x41\xd1\x5d\xfa\xff\xf1" - "\xe9\xc7\xa5\xd9\xed\xf8\x98\x1b" - "\x07\x89\x10\x82\x6a\x70\x9a\x8f" - "\x5e\x19\x9b\xf5\xc5\xe3\xcd\x22" - "\x92\xa5\xc2\xb8\x51\x2e\x5e\x0e" - "\xa4\xbe\x5f\xb1\xc1\x90\xd7\xe7" - "\xf7\x52\xae\x28\x29\xa8\x22\xa4" - "\x4f\xae\x48\xc2\xfa\x75\x8b\x9e" - "\xce\x83\x2a\x88\x07\x55\xbb\x89" - "\xf6\xdf\xac\xdf\x83\x08\xbf\x7d" - "\xac\x30\x8b\x8e\x02\xac\x00\xf1" - "\x30\x46\xe1\xbc\x75\xbf\x49\xbb" - "\x26\x4e\x29\xf0\x2f\x21\xc6\x13" - "\x92\xd9\x3d\x11\xe4\x10\x00\x8e" - "\xd4\xd4\x58\x65\xa6\x2b\xe3\x25" - "\xb1\x8f\x15\x93\xe7\x71\xb9\x2c" - "\x4b", - .alen = 129, - .ptext = "\xd1\xcf\xd0\x39\xa1\x99\xa9\x78" - "\x09\xfe\xd2\xfd\xec\xc1\xc9\x9d" - "\xd2\x39\x93\xa3\xab\x18\x7a\x95" - "\x8f\x24\xd3\xeb\x7b\xfa\xb5\xd8" - "\x15\xd1\xc3\x04\x69\x32\xe3\x4d" - "\xaa\xc2\x04\x8b\xf2\xfa\xdc\x4a" - "\x02\xeb\xa8\x90\x03\xfd\xea\x97" - "\x43\xaf\x2e\x92\xf8\x57\xc5\x6a" - "\x00", - .plen = 65, - .ctext = "\x7d\xde\x53\x22\xe4\x23\x3b\x30" - "\x78\xde\x35\x90\x7a\xd9\x0b\x93" - "\xf6\x0e\x0b\xed\x40\xee\x10\x9c" - "\x96\x3a\xd3\x34\xb2\xd0\x67\xcf" - "\x63\x7f\x2d\x0c\xcf\x96\xec\x64" - "\x1a\x87\xcc\x7d\x2c\x5e\x81\x4b" - "\xd2\x8f\x4c\x7c\x00\xb1\xb4\xe0" - "\x87\x4d\xb1\xbc\xd8\x78\x2c\x17" - "\xf2\x3b\xd8\x28\x40\xe2\x76\xf6" - "\x20\x13\x83\x46\xaf\xff\xe3\x0f" - "\x72", - .clen = 81, - }, { - .key = "\xe0\x98\x5e\xa1\xe5\x38\x53\xff" - "\x2a\x35\xfe\x21\xea\x8a\xfa\x1e" - "\x9c\xea\x15\xc5\xec\xc0\x5b\x9b" - "\xbf\x2f\x0a\xe1\x32\x12\x9d\x8e", - .klen = 32, - .iv = "\xef\x61\xed\x08\x29\xd7\xfd\x86" - "\x4a\x6b\x2b\x46\xe9\x53\x2a\xa0", - .assoc = "\xfe\x2a\x7b\x70\x6d\x75\xa7\x0d" - "\x6a\xa2\x57\x6a\xe7\x1c\x5b\x21" - "\x31\x4b\x1b\x07\x6f\x10\x1c\xa8" - "\x20\x46\x7a\xce\x9f\x42\x6d\xf9", - .alen = 32, - .ptext = "\x0d\xf4\x09\xd8\xb1\x14\x51\x94" - "\x8a\xd8\x84\x8e\xe6\xe5\x8c\xa3" - "\xfc\xfc\x9e\x28\xb0\xb8\xfc\xaf" - "\x50\x52\xb1\xc4\x55\x59\x55\xaf", - .plen = 32, - .ctext = "\x5a\xcd\x8c\x57\xf2\x6a\xb6\xbe" - "\x53\xc7\xaa\x9a\x60\x74\x9c\xc4" - "\xa2\xc2\xd0\x6d\xe1\x03\x63\xdc" - "\xbb\x51\x7e\x9c\x89\x73\xde\x4e" - "\x24\xf8\x52\x7c\x15\x41\x0e\xba" - "\x69\x0e\x36\x5f\x2f\x22\x8c", - .clen = 47, - }, { - .key = "\x1c\xbd\x98\x40\xf5\xb3\xfc\x1b" - "\xaa\x0f\xb0\xb3\xe4\xae\xbc\x24" - "\xc7\xac\x21\x49\xf1\x60\xdd\xb5" - "\x80\x5d\xe9\xba\x0c\x71\x3c\x64", - .klen = 32, - .iv = "\x2c\x86\x26\xa8\x39\x52\xa6\xa2" - "\xcb\x45\xdd\xd7\xe3\x77\xed\xa6", - .assoc = "\x3b\x4f\xb5\x10\x7d\xf1\x50\x29" - "\xeb\x7c\x0a\xfb\xe1\x40\x1e\x27" - "\x5c\x0d\x27\x8b\x74\xb0\x9e\xc2" - "\xe1\x74\x59\xa6\x79\xa1\x0c\xd0", - .alen = 32, - .ptext = "\x4a\x18\x43\x77\xc1\x90\xfa\xb0" - "\x0b\xb2\x36\x20\xe0\x09\x4e\xa9" - "\x26\xbe\xaa\xac\xb5\x58\x7e\xc8" - "\x11\x7f\x90\x9c\x2f\xb8\xf4\x85", - .plen = 32, - .ctext = "\x47\xd6\xce\x78\xd6\xbf\x4a\x51" - "\xb8\xda\x92\x3c\xfd\xda\xac\x8e" - "\x8d\x88\xd7\x4d\x90\xe5\xeb\xa1" - "\xab\xd6\x7c\x76\xad\xea\x7d\x76" - "\x53\xee\xb0\xcd\xd0\x02\xbb\x70" - "\x5b\x6f\x7b\xe2\x8c\xe8", - .clen = 46, - }, { - .key = "\x59\xe1\xd2\xdf\x05\x2f\xa4\x37" - "\x2b\xe9\x63\x44\xde\xd3\x7f\x2b" - "\xf1\x6f\x2d\xcd\xf6\x00\x5f\xcf" - "\x42\x8a\xc8\x92\xe6\xd0\xdc\x3b", - .klen = 32, - .iv = "\x68\xab\x60\x47\x49\xce\x4f\xbe" - "\x4c\x20\x8f\x68\xdd\x9c\xb0\xac", - .assoc = "\x77\x74\xee\xaf\x8d\x6d\xf9\x45" - "\x6c\x56\xbc\x8d\xdb\x65\xe0\x2e" - "\x86\xd0\x32\x0f\x79\x50\x20\xdb" - "\xa2\xa1\x37\x7e\x53\x00\xab\xa6", - .alen = 32, - .ptext = "\x86\x3d\x7d\x17\xd1\x0c\xa3\xcc" - "\x8c\x8d\xe8\xb1\xda\x2e\x11\xaf" - "\x51\x80\xb5\x30\xba\xf8\x00\xe2" - "\xd3\xad\x6f\x75\x09\x18\x93\x5c", - .plen = 32, - .ctext = "\x9f\xa9\x2b\xa4\x8f\x00\x05\x2b" - "\xe7\x68\x81\x51\xbb\xfb\xdf\x60" - "\xbb\xac\xe8\xc1\xdc\x68\xae\x68" - "\x3a\xcd\x7a\x06\x49\xfe\x80\x11" - "\xe6\x61\x99\xe2\xdd\xbe\x2c\xbf", - .clen = 40, - }, { - .key = "\x96\x06\x0b\x7f\x15\xab\x4d\x53" - "\xac\xc3\x15\xd6\xd8\xf7\x42\x31" - "\x1b\x31\x38\x51\xfc\xa0\xe1\xe8" - "\x03\xb8\xa7\x6b\xc0\x2f\x7b\x11", - .klen = 32, - .iv = "\xa5\xcf\x9a\xe6\x59\x4a\xf7\xd9" - "\xcd\xfa\x41\xfa\xd7\xc0\x72\xb2", - .assoc = "\xb4\x99\x28\x4e\x9d\xe8\xa2\x60" - "\xed\x30\x6e\x1e\xd5\x89\xa3\x34" - "\xb1\x92\x3e\x93\x7e\xf0\xa2\xf5" - "\x64\xcf\x16\x57\x2d\x5f\x4a\x7d", - .alen = 32, - .ptext = "\xc3\x62\xb7\xb6\xe2\x87\x4c\xe7" - "\x0d\x67\x9a\x43\xd4\x52\xd4\xb5" - "\x7b\x43\xc1\xb5\xbf\x98\x82\xfc" - "\x94\xda\x4e\x4d\xe4\x77\x32\x32", - .plen = 32, - .ctext = "\xe2\x34\xfa\x25\xfd\xfb\x89\x5e" - "\x5b\x4e\x0b\x15\x6e\x39\xfb\x0c" - "\x73\xc7\xd9\x6b\xbe\xce\x9b\x70" - "\xc7\x4f\x96\x16\x03\xfc\xea\xfb" - "\x56", - .clen = 33, - }, -}; - /* * All key wrapping test vectors taken from * http://csrc.nist.gov/groups/STM/cavp/documents/mac/kwtestvectors.zip diff --git a/include/crypto/morus1280_glue.h b/include/crypto/morus1280_glue.h deleted file mode 100644 index 5cefddb1991f..000000000000 --- a/include/crypto/morus1280_glue.h +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Common glue skeleton -- header file - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#ifndef _CRYPTO_MORUS1280_GLUE_H -#define _CRYPTO_MORUS1280_GLUE_H - -#include -#include -#include -#include -#include - -#define MORUS1280_WORD_SIZE 8 -#define MORUS1280_BLOCK_SIZE (MORUS_BLOCK_WORDS * MORUS1280_WORD_SIZE) - -struct morus1280_block { - u8 bytes[MORUS1280_BLOCK_SIZE]; -}; - -struct morus1280_glue_ops { - void (*init)(void *state, const void *key, const void *iv); - void (*ad)(void *state, const void *data, unsigned int length); - void (*enc)(void *state, const void *src, void *dst, unsigned int length); - void (*dec)(void *state, const void *src, void *dst, unsigned int length); - void (*enc_tail)(void *state, const void *src, void *dst, unsigned int length); - void (*dec_tail)(void *state, const void *src, void *dst, unsigned int length); - void (*final)(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -}; - -struct morus1280_ctx { - const struct morus1280_glue_ops *ops; - struct morus1280_block key; -}; - -void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, - const struct morus1280_glue_ops *ops); -int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen); -int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize); -int crypto_morus1280_glue_encrypt(struct aead_request *req); -int crypto_morus1280_glue_decrypt(struct aead_request *req); - -#define MORUS1280_DECLARE_ALG(id, driver_name, priority) \ - static const struct morus1280_glue_ops crypto_morus1280_##id##_ops = {\ - .init = crypto_morus1280_##id##_init, \ - .ad = crypto_morus1280_##id##_ad, \ - .enc = crypto_morus1280_##id##_enc, \ - .enc_tail = crypto_morus1280_##id##_enc_tail, \ - .dec = crypto_morus1280_##id##_dec, \ - .dec_tail = crypto_morus1280_##id##_dec_tail, \ - .final = crypto_morus1280_##id##_final, \ - }; \ - \ - static int crypto_morus1280_##id##_init_tfm(struct crypto_aead *tfm) \ - { \ - crypto_morus1280_glue_init_ops(tfm, &crypto_morus1280_##id##_ops); \ - return 0; \ - } \ - \ - static void crypto_morus1280_##id##_exit_tfm(struct crypto_aead *tfm) \ - { \ - } \ - \ - static struct aead_alg crypto_morus1280_##id##_alg = { \ - .setkey = crypto_morus1280_glue_setkey, \ - .setauthsize = crypto_morus1280_glue_setauthsize, \ - .encrypt = crypto_morus1280_glue_encrypt, \ - .decrypt = crypto_morus1280_glue_decrypt, \ - .init = crypto_morus1280_##id##_init_tfm, \ - .exit = crypto_morus1280_##id##_exit_tfm, \ - \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS1280_BLOCK_SIZE, \ - \ - .base = { \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct morus1280_ctx), \ - .cra_alignmask = 0, \ - .cra_priority = priority, \ - \ - .cra_name = "__morus1280", \ - .cra_driver_name = "__"driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ - } - -#endif /* _CRYPTO_MORUS1280_GLUE_H */ diff --git a/include/crypto/morus640_glue.h b/include/crypto/morus640_glue.h deleted file mode 100644 index 0ee6266cb26c..000000000000 --- a/include/crypto/morus640_glue.h +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * Common glue skeleton -- header file - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#ifndef _CRYPTO_MORUS640_GLUE_H -#define _CRYPTO_MORUS640_GLUE_H - -#include -#include -#include -#include -#include - -#define MORUS640_WORD_SIZE 4 -#define MORUS640_BLOCK_SIZE (MORUS_BLOCK_WORDS * MORUS640_WORD_SIZE) - -struct morus640_block { - u8 bytes[MORUS640_BLOCK_SIZE]; -}; - -struct morus640_glue_ops { - void (*init)(void *state, const void *key, const void *iv); - void (*ad)(void *state, const void *data, unsigned int length); - void (*enc)(void *state, const void *src, void *dst, unsigned int length); - void (*dec)(void *state, const void *src, void *dst, unsigned int length); - void (*enc_tail)(void *state, const void *src, void *dst, unsigned int length); - void (*dec_tail)(void *state, const void *src, void *dst, unsigned int length); - void (*final)(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -}; - -struct morus640_ctx { - const struct morus640_glue_ops *ops; - struct morus640_block key; -}; - -void crypto_morus640_glue_init_ops(struct crypto_aead *aead, - const struct morus640_glue_ops *ops); -int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen); -int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize); -int crypto_morus640_glue_encrypt(struct aead_request *req); -int crypto_morus640_glue_decrypt(struct aead_request *req); - -#define MORUS640_DECLARE_ALG(id, driver_name, priority) \ - static const struct morus640_glue_ops crypto_morus640_##id##_ops = {\ - .init = crypto_morus640_##id##_init, \ - .ad = crypto_morus640_##id##_ad, \ - .enc = crypto_morus640_##id##_enc, \ - .enc_tail = crypto_morus640_##id##_enc_tail, \ - .dec = crypto_morus640_##id##_dec, \ - .dec_tail = crypto_morus640_##id##_dec_tail, \ - .final = crypto_morus640_##id##_final, \ - }; \ - \ - static int crypto_morus640_##id##_init_tfm(struct crypto_aead *tfm) \ - { \ - crypto_morus640_glue_init_ops(tfm, &crypto_morus640_##id##_ops); \ - return 0; \ - } \ - \ - static void crypto_morus640_##id##_exit_tfm(struct crypto_aead *tfm) \ - { \ - } \ - \ - static struct aead_alg crypto_morus640_##id##_alg = {\ - .setkey = crypto_morus640_glue_setkey, \ - .setauthsize = crypto_morus640_glue_setauthsize, \ - .encrypt = crypto_morus640_glue_encrypt, \ - .decrypt = crypto_morus640_glue_decrypt, \ - .init = crypto_morus640_##id##_init_tfm, \ - .exit = crypto_morus640_##id##_exit_tfm, \ - \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS640_BLOCK_SIZE, \ - \ - .base = { \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct morus640_ctx), \ - .cra_alignmask = 0, \ - .cra_priority = priority, \ - \ - .cra_name = "__morus640", \ - .cra_driver_name = "__"driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ - } - -#endif /* _CRYPTO_MORUS640_GLUE_H */ diff --git a/include/crypto/morus_common.h b/include/crypto/morus_common.h deleted file mode 100644 index 969510a9a56c..000000000000 --- a/include/crypto/morus_common.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * The MORUS Authenticated-Encryption Algorithm - * Common definitions - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#ifndef _CRYPTO_MORUS_COMMON_H -#define _CRYPTO_MORUS_COMMON_H - -#define MORUS_BLOCK_WORDS 4 -#define MORUS_STATE_BLOCKS 5 -#define MORUS_NONCE_SIZE 16 -#define MORUS_MAX_AUTH_SIZE 16 - -#endif /* _CRYPTO_MORUS_COMMON_H */ -- cgit v1.2.3 From 17a8f868ae3e85a173843b1ac65e744e8585bc5a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 8 Jul 2019 11:24:56 +0530 Subject: opp: Return genpd virtual devices from dev_pm_opp_attach_genpd() The cpufreq drivers don't need to do runtime PM operations on the virtual devices returned by dev_pm_domain_attach_by_name() and so the virtual devices weren't shared with the callers of dev_pm_opp_attach_genpd() earlier. But the IO device drivers would want to do that. This patch updates the prototype of dev_pm_opp_attach_genpd() to accept another argument to return the pointer to the array of genpd virtual devices. Reported-by: Rajendra Nayak Tested-by: Rajendra Nayak Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 6 +++++- include/linux/pm_opp.h | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 08b830f84d8c..bdb435822401 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1771,6 +1771,7 @@ static void _opp_detach_genpd(struct opp_table *opp_table) * dev_pm_opp_attach_genpd - Attach genpd(s) for the device and save virtual device pointer * @dev: Consumer device for which the genpd is getting attached. * @names: Null terminated array of pointers containing names of genpd to attach. + * @virt_devs: Pointer to return the array of virtual devices. * * Multiple generic power domains for a device are supported with the help of * virtual genpd devices, which are created for each consumer device - genpd @@ -1788,7 +1789,8 @@ static void _opp_detach_genpd(struct opp_table *opp_table) * The order of entries in the names array must match the order in which * "required-opps" are added in DT. */ -struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names) +struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, + const char **names, struct device ***virt_devs) { struct opp_table *opp_table; struct device *virt_dev; @@ -1842,6 +1844,8 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names name++; } + if (virt_devs) + *virt_devs = opp_table->genpd_virt_devs; mutex_unlock(&opp_table->genpd_virt_dev_lock); return opp_table; diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index af5021f27cb7..5bdceca5125d 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -128,7 +128,7 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name); void dev_pm_opp_put_clkname(struct opp_table *opp_table); struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table); -struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names); +struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); void dev_pm_opp_detach_genpd(struct opp_table *opp_table); int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); @@ -292,7 +292,7 @@ static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const static inline void dev_pm_opp_put_clkname(struct opp_table *opp_table) {} -static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names) +static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs) { return ERR_PTR(-ENOTSUPP); } -- cgit v1.2.3 From 71419d84c216cee8da3b19fb843b4242f112cde4 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 25 Jul 2019 12:41:29 +0200 Subject: opp: Add dev_pm_opp_find_level_exact() Since the performance states in the OPP table are unique, implement a dev_pm_opp_find_level_exact() in order to be able to fetch a specific OPP. Signed-off-by: Niklas Cassel [ Viresh: Updated commit log ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 8 ++++++++ 2 files changed, 56 insertions(+) (limited to 'include') diff --git a/drivers/opp/core.c b/drivers/opp/core.c index bdb435822401..0ee8c0133d3e 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -401,6 +401,54 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact); +/** + * dev_pm_opp_find_level_exact() - search for an exact level + * @dev: device for which we do this operation + * @level: level to search for + * + * Return: Searches for exact match in the opp table and returns pointer to the + * matching opp if found, else returns ERR_PTR in case of error and should + * be handled using IS_ERR. Error return values can be: + * EINVAL: for bad pointer + * ERANGE: no match found for search + * ENODEV: if device not found in list of registered devices + * + * The callers are required to call dev_pm_opp_put() for the returned OPP after + * use. + */ +struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, + unsigned int level) +{ + struct opp_table *opp_table; + struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); + + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + int r = PTR_ERR(opp_table); + + dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r); + return ERR_PTR(r); + } + + mutex_lock(&opp_table->lock); + + list_for_each_entry(temp_opp, &opp_table->opp_list, node) { + if (temp_opp->level == level) { + opp = temp_opp; + + /* Increment the reference count of OPP */ + dev_pm_opp_get(opp); + break; + } + } + + mutex_unlock(&opp_table->lock); + dev_pm_opp_put_opp_table(opp_table); + + return opp; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_exact); + static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table, unsigned long *freq) { diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 5bdceca5125d..b8197ab014f2 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -96,6 +96,8 @@ unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev); struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, unsigned long freq, bool available); +struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, + unsigned int level); struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq); @@ -200,6 +202,12 @@ static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, return ERR_PTR(-ENOTSUPP); } +static inline struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, + unsigned int level) +{ + return ERR_PTR(-ENOTSUPP); +} + static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq) { -- cgit v1.2.3 From 5db4c4b9559f8cddd5f7f74e58c7b8f172120e6d Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Tue, 23 Jul 2019 21:00:01 +0300 Subject: mac80211: pass the vif to cancel_remain_on_channel This low level driver can find it useful to get the vif when a remain on channel session is cancelled. iwlwifi will need this soon. Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20190723180001.5828-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 3 ++- drivers/net/wireless/ath/ath9k/main.c | 3 ++- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 3 ++- drivers/net/wireless/mac80211_hwsim.c | 3 ++- drivers/net/wireless/rsi/rsi_91x_mac80211.c | 3 ++- drivers/net/wireless/ti/wlcore/main.c | 3 ++- include/net/mac80211.h | 3 ++- net/mac80211/driver-ops.h | 8 +++++--- net/mac80211/offchannel.c | 5 +++-- net/mac80211/trace.h | 7 ++++--- 10 files changed, 26 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 0606416dc971..12dad659bf68 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -6970,7 +6970,8 @@ exit: return ret; } -static int ath10k_cancel_remain_on_channel(struct ieee80211_hw *hw) +static int ath10k_cancel_remain_on_channel(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) { struct ath10k *ar = hw->priv; diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index f23cb2f3d296..34121fbf32e3 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -2392,7 +2392,8 @@ out: return ret; } -static int ath9k_cancel_remain_on_channel(struct ieee80211_hw *hw) +static int ath9k_cancel_remain_on_channel(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) { struct ath_softc *sc = hw->priv; struct ath_common *common = ath9k_hw_common(sc->sc_ah); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 55cd49ccbf0b..e63623251d61 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -4010,7 +4010,8 @@ out_unlock: return ret; } -static int iwl_mvm_cancel_roc(struct ieee80211_hw *hw) +static int iwl_mvm_cancel_roc(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index c4611eb4eb86..0869f924e60c 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2216,7 +2216,8 @@ static int mac80211_hwsim_roc(struct ieee80211_hw *hw, return 0; } -static int mac80211_hwsim_croc(struct ieee80211_hw *hw) +static int mac80211_hwsim_croc(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) { struct mac80211_hwsim_data *hwsim = hw->priv; diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c index 49df3bb08d41..ce5e92d82efc 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c +++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c @@ -1818,7 +1818,8 @@ out: return status; } -static int rsi_mac80211_cancel_roc(struct ieee80211_hw *hw) +static int rsi_mac80211_cancel_roc(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) { struct rsi_hw *adapter = hw->priv; struct rsi_common *common = adapter->priv; diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index b74dc8bc9755..547ad538d8b6 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -5749,7 +5749,8 @@ static void wlcore_roc_complete_work(struct work_struct *work) ieee80211_remain_on_channel_expired(wl->hw); } -static int wlcore_op_cancel_remain_on_channel(struct ieee80211_hw *hw) +static int wlcore_op_cancel_remain_on_channel(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) { struct wl1271 *wl = hw->priv; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d26da013f7c0..e39bf85ae4c2 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3914,7 +3914,8 @@ struct ieee80211_ops { struct ieee80211_channel *chan, int duration, enum ieee80211_roc_type type); - int (*cancel_remain_on_channel)(struct ieee80211_hw *hw); + int (*cancel_remain_on_channel)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); int (*set_ringparam)(struct ieee80211_hw *hw, u32 tx, u32 rx); void (*get_ringparam)(struct ieee80211_hw *hw, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index c2d8b5451a5e..2c9b3eb8b652 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -692,14 +692,16 @@ static inline int drv_remain_on_channel(struct ieee80211_local *local, return ret; } -static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local) +static inline int +drv_cancel_remain_on_channel(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); - trace_drv_cancel_remain_on_channel(local); - ret = local->ops->cancel_remain_on_channel(&local->hw); + trace_drv_cancel_remain_on_channel(local, sdata); + ret = local->ops->cancel_remain_on_channel(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 60ef8972b254..c710504ccf1a 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -8,6 +8,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2009 Johannes Berg + * Copyright (C) 2019 Intel Corporation */ #include #include @@ -732,7 +733,7 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, } if (local->ops->remain_on_channel) { - ret = drv_cancel_remain_on_channel(local); + ret = drv_cancel_remain_on_channel(local, roc->sdata); if (WARN_ON_ONCE(ret)) { mutex_unlock(&local->mtx); return ret; @@ -991,7 +992,7 @@ void ieee80211_roc_purge(struct ieee80211_local *local, if (roc->started) { if (local->ops->remain_on_channel) { /* can race, so ignore return value */ - drv_cancel_remain_on_channel(local); + drv_cancel_remain_on_channel(local, sdata); ieee80211_roc_notify_destroy(roc); } else { roc->abort = true; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 3bb4459b52c7..4768322dc202 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1242,9 +1242,10 @@ TRACE_EVENT(drv_remain_on_channel, ) ); -DEFINE_EVENT(local_only_evt, drv_cancel_remain_on_channel, - TP_PROTO(struct ieee80211_local *local), - TP_ARGS(local) +DEFINE_EVENT(local_sdata_evt, drv_cancel_remain_on_channel, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) ); TRACE_EVENT(drv_set_ringparam, -- cgit v1.2.3 From 612fcfd9b31f08858d2a2e1279adda367e1ade00 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 12 Jun 2019 16:26:58 +0200 Subject: mac80211: remove unused and unneeded remove_sta_debugfs callback The remove_sta_debugfs callback in struct rate_control_ops is no longer used by any driver, as there is no need for it (the debugfs directory is already removed recursivly by the mac80211 core.) Because no one needs it, just remove it to keep anyone else from accidentally using it in the future. Cc: Johannes Berg Cc: "David S. Miller" Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20190612142658.12792-5-gregkh@linuxfoundation.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 1 - net/mac80211/rate.h | 9 --------- net/mac80211/sta_info.c | 1 - 3 files changed, 11 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e39bf85ae4c2..6cc5b25edf9d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5946,7 +5946,6 @@ struct rate_control_ops { void (*add_sta_debugfs)(void *priv, void *priv_sta, struct dentry *dir); - void (*remove_sta_debugfs)(void *priv, void *priv_sta); u32 (*get_expected_throughput)(void *priv_sta); }; diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 5d5348bc41ec..5397c6dad056 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -60,15 +60,6 @@ static inline void rate_control_add_sta_debugfs(struct sta_info *sta) #endif } -static inline void rate_control_remove_sta_debugfs(struct sta_info *sta) -{ -#ifdef CONFIG_MAC80211_DEBUGFS - struct rate_control_ref *ref = sta->rate_ctrl; - if (ref && ref->ops->remove_sta_debugfs) - ref->ops->remove_sta_debugfs(ref->priv, sta->rate_ctrl_priv); -#endif -} - void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata); /* Get a reference to the rate control algorithm. If `name' is NULL, get the diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 95eb8220e2e4..fb6614f57cbc 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1065,7 +1065,6 @@ static void __sta_info_destroy_part2(struct sta_info *sta) cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); - rate_control_remove_sta_debugfs(sta); ieee80211_sta_debugfs_remove(sta); cleanup_single_sta(sta); -- cgit v1.2.3 From fb0e76abe34bd67756dbdf4d5982b7dc54afa1d8 Mon Sep 17 00:00:00 2001 From: Erik Stromdahl Date: Mon, 17 Jun 2019 22:01:39 +0200 Subject: mac80211: add tx dequeue function for process context Since ieee80211_tx_dequeue() must not be called with softirqs enabled (i.e. from process context without proper disable of bottom halves), we add a wrapper that disables bottom halves before calling ieee80211_tx_dequeue() The new function is named ieee80211_tx_dequeue_ni() just as all other from-process-context versions found in mac80211. The documentation of ieee80211_tx_dequeue() is also updated so it mentions that the function should not be called from process context. Signed-off-by: Erik Stromdahl Link: https://lore.kernel.org/r/20190617200140.6189-1-erik.stromdahl@gmail.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 26 ++++++++++++++++++++++++++ net/mac80211/tx.c | 2 ++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6cc5b25edf9d..fbe1c29e3349 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6234,10 +6234,36 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); * but for the duration of the frame handling. * However, also note that while in the wake_tx_queue() method, * rcu_read_lock() is already held. + * + * softirqs must also be disabled when this function is called. + * In process context, use ieee80211_tx_dequeue_ni() instead. */ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq); +/** + * ieee80211_tx_dequeue_ni - dequeue a packet from a software tx queue + * (in process context) + * + * Like ieee80211_tx_dequeue() but can be called in process context + * (internally disables bottom halves). + * + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @txq: pointer obtained from station or virtual interface, or from + * ieee80211_next_txq() + */ +static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = ieee80211_tx_dequeue(hw, txq); + local_bh_enable(); + + return skb; +} + /** * ieee80211_next_txq - get next tx queue to pull packets from * diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f13eb2f61ccf..fb8870d9eba3 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3546,6 +3546,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, ieee80211_tx_result r; struct ieee80211_vif *vif = txq->vif; + WARN_ON_ONCE(softirq_count() == 0); + begin: spin_lock_bh(&fq->lock); -- cgit v1.2.3 From 3e47bf1ca4c363ba8b1f99c4c3dcda13d2979954 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Sat, 29 Jun 2019 21:50:13 +0200 Subject: mac80211: Simplify Extended Key ID API 1) Drop IEEE80211_HW_EXT_KEY_ID_NATIVE and let drivers directly set the NL80211_EXT_FEATURE_EXT_KEY_ID flag. 2) Drop IEEE80211_HW_NO_AMPDU_KEYBORDER_SUPPORT and simply assume all drivers are unable to handle A-MPDU key borders. The new Extended Key ID API now requires all mac80211 drivers to set NL80211_EXT_FEATURE_EXT_KEY_ID when they implement set_key() and can handle Extended Key ID. For drivers not providing set_key() mac80211 itself enables Extended Key ID support, using the internal SW crypto services. Signed-off-by: Alexander Wetzel Link: https://lore.kernel.org/r/20190629195015.19680-2-alexander@wetzel-home.de Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 -------- net/mac80211/debugfs.c | 2 -- net/mac80211/key.c | 18 ++++++++---------- net/mac80211/main.c | 18 ++++++------------ 4 files changed, 14 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fbe1c29e3349..58941893a13f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2268,12 +2268,6 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID * only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set. * - * @IEEE80211_HW_EXT_KEY_ID_NATIVE: Driver and hardware are supporting Extended - * Key ID and can handle two unicast keys per station for Rx and Tx. - * - * @IEEE80211_HW_NO_AMPDU_KEYBORDER_SUPPORT: The card/driver can't handle - * active Tx A-MPDU sessions with Extended Key IDs during rekey. - * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2325,8 +2319,6 @@ enum ieee80211_hw_flags { IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN, IEEE80211_HW_SUPPORTS_MULTI_BSSID, IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, - IEEE80211_HW_EXT_KEY_ID_NATIVE, - IEEE80211_HW_NO_AMPDU_KEYBORDER_SUPPORT, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 2e7f75938c51..47435f57e086 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -271,8 +271,6 @@ static const char *hw_flag_names[] = { FLAG(TX_STATUS_NO_AMPDU_LEN), FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), - FLAG(EXT_KEY_ID_NATIVE), - FLAG(NO_AMPDU_KEYBORDER_SUPPORT), #undef FLAG }; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index dd60f6428049..92c3affb0eb0 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -270,8 +270,7 @@ int ieee80211_set_tx_key(struct ieee80211_key *key) sta->ptk_idx = key->conf.keyidx; - if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT)) - clear_sta_flag(sta, WLAN_STA_BLOCK_BA); + clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; @@ -289,16 +288,15 @@ static void ieee80211_pairwise_rekey(struct ieee80211_key *old, if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) { /* Extended Key ID key install, initial one or rekey */ - if (sta->ptk_idx != INVALID_PTK_KEYIDX && - ieee80211_hw_check(&local->hw, - NO_AMPDU_KEYBORDER_SUPPORT)) { + if (sta->ptk_idx != INVALID_PTK_KEYIDX) { /* Aggregation Sessions with Extended Key ID must not * mix MPDUs with different keyIDs within one A-MPDU. - * Tear down any running Tx aggregation and all new - * Rx/Tx aggregation request during rekey if the driver - * asks us to do so. (Blocking Tx only would be - * sufficient but WLAN_STA_BLOCK_BA gets the job done - * for the few ms we need it.) + * Tear down running Tx aggregation sessions and block + * new Rx/Tx aggregation requests during rekey to + * ensure there are no A-MPDUs for the driver to + * aggregate. (Blocking Tx only would be sufficient but + * WLAN_STA_BLOCK_BA gets the job done for the few ms + * we need it.) */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); mutex_lock(&sta->ampdu_mlme.mtx); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 4c2702f128f3..29b9d57df1a3 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1048,21 +1048,15 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) } } - /* Enable Extended Key IDs when driver allowed it, or when it - * supports neither HW crypto nor A-MPDUs + /* Mac80211 and therefore all drivers using SW crypto only + * are able to handle PTK rekeys and Extended Key ID. */ - if ((!local->ops->set_key && - !ieee80211_hw_check(hw, AMPDU_AGGREGATION)) || - ieee80211_hw_check(&local->hw, EXT_KEY_ID_NATIVE)) - wiphy_ext_feature_set(local->hw.wiphy, - NL80211_EXT_FEATURE_EXT_KEY_ID); - - /* Mac80211 and therefore all cards only using SW crypto are able to - * handle PTK rekeys correctly - */ - if (!local->ops->set_key) + if (!local->ops->set_key) { wiphy_ext_feature_set(local->hw.wiphy, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0); + wiphy_ext_feature_set(local->hw.wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID); + } /* * Calculate scan IE length -- we need this to alloc -- cgit v1.2.3 From dc3998ec5cf2d377f2e85ba16b6a15affec98a0a Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Sat, 29 Jun 2019 21:50:14 +0200 Subject: mac80211: AMPDU handling for rekeys with Extended Key ID Extended Key ID allows A-MPDU sessions while rekeying as long as each A-MPDU aggregates only MPDUs with one keyid together. Drivers able to segregate MPDUs accordingly can tell mac80211 to not stop A-MPDU sessions when rekeying by setting the new flag IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT. Signed-off-by: Alexander Wetzel Link: https://lore.kernel.org/r/20190629195015.19680-3-alexander@wetzel-home.de Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 +++++ net/mac80211/debugfs.c | 1 + net/mac80211/key.c | 14 ++++++++------ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 58941893a13f..0187d84031fc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2268,6 +2268,10 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID * only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set. * + * @IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT: The card and driver is only + * aggregating MPDUs with the same keyid, allowing mac80211 to keep Tx + * A-MPDU sessions active while rekeying with Extended Key ID. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2319,6 +2323,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN, IEEE80211_HW_SUPPORTS_MULTI_BSSID, IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, + IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 47435f57e086..568b3b276931 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -271,6 +271,7 @@ static const char *hw_flag_names[] = { FLAG(TX_STATUS_NO_AMPDU_LEN), FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), + FLAG(AMPDU_KEYBORDER_SUPPORT), #undef FLAG }; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 92c3affb0eb0..7dfee848abac 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -270,7 +270,8 @@ int ieee80211_set_tx_key(struct ieee80211_key *key) sta->ptk_idx = key->conf.keyidx; - clear_sta_flag(sta, WLAN_STA_BLOCK_BA); + if (!ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) + clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; @@ -288,15 +289,16 @@ static void ieee80211_pairwise_rekey(struct ieee80211_key *old, if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) { /* Extended Key ID key install, initial one or rekey */ - if (sta->ptk_idx != INVALID_PTK_KEYIDX) { + if (sta->ptk_idx != INVALID_PTK_KEYIDX && + !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) { /* Aggregation Sessions with Extended Key ID must not * mix MPDUs with different keyIDs within one A-MPDU. * Tear down running Tx aggregation sessions and block * new Rx/Tx aggregation requests during rekey to - * ensure there are no A-MPDUs for the driver to - * aggregate. (Blocking Tx only would be sufficient but - * WLAN_STA_BLOCK_BA gets the job done for the few ms - * we need it.) + * ensure there are no A-MPDUs when the driver is not + * supporting A-MPDU key borders. (Blocking Tx only + * would be sufficient but WLAN_STA_BLOCK_BA gets the + * job done for the few ms we need it.) */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); mutex_lock(&sta->ampdu_mlme.mtx); -- cgit v1.2.3 From 2aa485e1148557215337731b2c79f5569edcbbab Mon Sep 17 00:00:00 2001 From: John Crispin Date: Sat, 13 Jul 2019 18:36:41 +0200 Subject: mac80211: add support for parsing ADDBA_EXT IEs ADDBA_EXT IEs can be used to negotiate the BA fragmentation level. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190713163642.18491-2-john@phrozen.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 ++++++++ net/mac80211/ieee80211_i.h | 1 + net/mac80211/util.c | 7 +++++++ 3 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8511fadc0935..f36144eda5d6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -881,6 +881,14 @@ struct ieee80211_tpc_report_ie { u8 link_margin; } __packed; +#define IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK GENMASK(2, 1) +#define IEEE80211_ADDBA_EXT_FRAG_LEVEL_SHIFT 1 +#define IEEE80211_ADDBA_EXT_NO_FRAG BIT(0) + +struct ieee80211_addba_ext_ie { + u8 data; +} __packed; + struct ieee80211_mgmt { __le16 frame_control; __le16 duration; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 004e2e3adb88..c67da3575e74 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1506,6 +1506,7 @@ struct ieee802_11_elems { u8 max_bssid_indicator; u8 dtim_count; u8 dtim_period; + const struct ieee80211_addba_ext_ie *addba_ext_ie; /* length of them, respectively */ u8 ext_capab_len; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 1b224fa27367..3441558ef2d2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1200,6 +1200,13 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elems->cisco_dtpc_elem = pos; break; + case WLAN_EID_ADDBA_EXT: + if (elen != sizeof(struct ieee80211_addba_ext_ie)) { + elem_parse_failed = true; + break; + } + elems->addba_ext_ie = (void *)pos; + break; case WLAN_EID_TIMEOUT_INTERVAL: if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) elems->timeout_int = (void *)pos; -- cgit v1.2.3 From cbe77dde4757446bbe333299b0c91d48b8d575a2 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Sun, 14 Jul 2019 17:44:14 +0200 Subject: mac80211: add xmit rate to struct ieee80211_tx_status Right now struct ieee80211_tx_rate cannot hold HE rates. Lets use struct ieee80211_tx_status instead. This will also make the code future-proof for when we have EHT. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190714154419.11854-2-john@phrozen.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0187d84031fc..6fe4381ba0ef 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1058,11 +1058,13 @@ struct ieee80211_tx_info { * @sta: Station that the packet was transmitted for * @info: Basic tx status information * @skb: Packet skb (can be NULL if not provided by the driver) + * @rate: The TX rate that was used when sending the packet */ struct ieee80211_tx_status { struct ieee80211_sta *sta; struct ieee80211_tx_info *info; struct sk_buff *skb; + struct rate_info *rate; }; /** -- cgit v1.2.3 From e6f4051123fd33901e9655a675b22aefcdc5d277 Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Mon, 22 Jul 2019 12:44:50 +0530 Subject: {nl,mac}80211: fix interface combinations on crypto controlled devices Commit 33d915d9e8ce ("{nl,mac}80211: allow 4addr AP operation on crypto controlled devices") has introduced a change which allows 4addr operation on crypto controlled devices (ex: ath10k). This change has inadvertently impacted the interface combinations logic on such devices. General rule is that software interfaces like AP/VLAN should not be listed under supported interface combinations and should not be considered during validation of these combinations; because of the aforementioned change, AP/VLAN interfaces(if present) will be checked against interfaces supported by the device and blocks valid interface combinations. Consider a case where an AP and AP/VLAN are up and running; when a second AP device is brought up on the same physical device, this AP will be checked against the AP/VLAN interface (which will not be part of supported interface combinations of the device) and blocks second AP to come up. Add a new API cfg80211_iftype_allowed() to fix the problem, this API works for all devices with/without SW crypto control. Signed-off-by: Manikanta Pubbisetty Fixes: 33d915d9e8ce ("{nl,mac}80211: allow 4addr AP operation on crypto controlled devices") Link: https://lore.kernel.org/r/1563779690-9716-1-git-send-email-mpubbise@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 +++++++++++++++ net/mac80211/util.c | 7 +++---- net/wireless/core.c | 6 ++---- net/wireless/nl80211.c | 4 +--- net/wireless/util.c | 27 +++++++++++++++++++++++++-- 5 files changed, 46 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 45850a8391d9..26e2ad2c7027 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7320,6 +7320,21 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev, struct cfg80211_pmsr_request *req, gfp_t gfp); +/** + * cfg80211_iftype_allowed - check whether the interface can be allowed + * @wiphy: the wiphy + * @iftype: interface type + * @is_4addr: use_4addr flag, must be '0' when check_swif is '1' + * @check_swif: check iftype against software interfaces + * + * Check whether the interface is allowed to operate; additionally, this API + * can be used to check iftype against the software interfaces when + * check_swif is '1'. + */ +bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, + bool is_4addr, u8 check_swif); + + /* Logging, debugging and troubleshooting/diagnostic helpers. */ /* wiphy_printk helpers, similar to dev_printk */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 1b224fa27367..ad1e58184c4e 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3796,9 +3796,7 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, } /* Always allow software iftypes */ - if (local->hw.wiphy->software_iftypes & BIT(iftype) || - (iftype == NL80211_IFTYPE_AP_VLAN && - local->hw.wiphy->flags & WIPHY_FLAG_4ADDR_AP)) { + if (cfg80211_iftype_allowed(local->hw.wiphy, iftype, 0, 1)) { if (radar_detect) return -EINVAL; return 0; @@ -3833,7 +3831,8 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, if (sdata_iter == sdata || !ieee80211_sdata_running(sdata_iter) || - local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype)) + cfg80211_iftype_allowed(local->hw.wiphy, + wdev_iter->iftype, 0, 1)) continue; params.iftype_num[wdev_iter->iftype]++; diff --git a/net/wireless/core.c b/net/wireless/core.c index 45d9afcff6d5..32b3c719fdfc 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1410,10 +1410,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, } break; case NETDEV_PRE_UP: - if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype)) && - !(wdev->iftype == NL80211_IFTYPE_AP_VLAN && - rdev->wiphy.flags & WIPHY_FLAG_4ADDR_AP && - wdev->use_4addr)) + if (!cfg80211_iftype_allowed(wdev->wiphy, wdev->iftype, + wdev->use_4addr, 0)) return notifier_from_errno(-EOPNOTSUPP); if (rfkill_blocked(rdev->rfkill)) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fc83dd179c1a..fd05ae1437a9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3484,9 +3484,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return err; } - if (!(rdev->wiphy.interface_modes & (1 << type)) && - !(type == NL80211_IFTYPE_AP_VLAN && params.use_4addr && - rdev->wiphy.flags & WIPHY_FLAG_4ADDR_AP)) + if (!cfg80211_iftype_allowed(&rdev->wiphy, type, params.use_4addr, 0)) return -EOPNOTSUPP; err = nl80211_parse_mon_options(rdev, type, info, ¶ms); diff --git a/net/wireless/util.c b/net/wireless/util.c index 1c39d6a2e850..d0e35b7b9e35 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1697,7 +1697,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { num_interfaces += params->iftype_num[iftype]; if (params->iftype_num[iftype] > 0 && - !(wiphy->software_iftypes & BIT(iftype))) + !cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) used_iftypes |= BIT(iftype); } @@ -1719,7 +1719,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, return -ENOMEM; for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { - if (wiphy->software_iftypes & BIT(iftype)) + if (cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) continue; for (j = 0; j < c->n_limits; j++) { all_iftypes |= limits[j].types; @@ -2072,3 +2072,26 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, return max_vht_nss; } EXPORT_SYMBOL(ieee80211_get_vht_max_nss); + +bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, + bool is_4addr, u8 check_swif) + +{ + bool is_vlan = iftype == NL80211_IFTYPE_AP_VLAN; + + switch (check_swif) { + case 0: + if (is_vlan && is_4addr) + return wiphy->flags & WIPHY_FLAG_4ADDR_AP; + return wiphy->interface_modes & BIT(iftype); + case 1: + if (!(wiphy->software_iftypes & BIT(iftype)) && is_vlan) + return wiphy->flags & WIPHY_FLAG_4ADDR_AP; + return wiphy->software_iftypes & BIT(iftype); + default: + break; + } + + return false; +} +EXPORT_SYMBOL(cfg80211_iftype_allowed); -- cgit v1.2.3 From 91b05a7e7d8033a90a64f5fc0e3808db423e420a Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Tue, 9 Jul 2019 13:11:24 +0200 Subject: crypto: user - make NETLINK_CRYPTO work inside netns Currently, NETLINK_CRYPTO works only in the init network namespace. It doesn't make much sense to cut it out of the other network namespaces, so do the minor plumbing work necessary to make it work in any network namespace. Code inspired by net/core/sock_diag.c. Tested using kcapi-dgst from libkcapi [1]: Before: # unshare -n kcapi-dgst -c sha256 Signed-off-by: Herbert Xu --- crypto/crypto_user_base.c | 37 ++++++++++++++++++++++++------------ crypto/crypto_user_stat.c | 4 +++- include/crypto/internal/cryptouser.h | 2 -- include/net/net_namespace.h | 3 +++ 4 files changed, 31 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c index c65e39005ce2..910e0b46012e 100644 --- a/crypto/crypto_user_base.c +++ b/crypto/crypto_user_base.c @@ -10,9 +10,10 @@ #include #include #include -#include #include +#include #include +#include #include #include #include @@ -25,9 +26,6 @@ static DEFINE_MUTEX(crypto_cfg_mutex); -/* The crypto netlink socket */ -struct sock *crypto_nlsk; - struct crypto_dump_info { struct sk_buff *in_skb; struct sk_buff *out_skb; @@ -186,6 +184,7 @@ out: static int crypto_report(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs) { + struct net *net = sock_net(in_skb->sk); struct crypto_user_alg *p = nlmsg_data(in_nlh); struct crypto_alg *alg; struct sk_buff *skb; @@ -217,7 +216,7 @@ drop_alg: if (err) return err; - return nlmsg_unicast(crypto_nlsk, skb, NETLINK_CB(in_skb).portid); + return nlmsg_unicast(net->crypto_nlsk, skb, NETLINK_CB(in_skb).portid); } static int crypto_dump_report(struct sk_buff *skb, struct netlink_callback *cb) @@ -420,6 +419,7 @@ static const struct crypto_link { static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct net *net = sock_net(skb->sk); struct nlattr *attrs[CRYPTOCFGA_MAX+1]; const struct crypto_link *link; int type, err; @@ -450,7 +450,7 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .done = link->done, .min_dump_alloc = min(dump_alloc, 65535UL), }; - err = netlink_dump_start(crypto_nlsk, skb, nlh, &c); + err = netlink_dump_start(net->crypto_nlsk, skb, nlh, &c); } return err; @@ -474,22 +474,35 @@ static void crypto_netlink_rcv(struct sk_buff *skb) mutex_unlock(&crypto_cfg_mutex); } -static int __init crypto_user_init(void) +static int __net_init crypto_netlink_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = crypto_netlink_rcv, }; - crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO, &cfg); - if (!crypto_nlsk) - return -ENOMEM; + net->crypto_nlsk = netlink_kernel_create(net, NETLINK_CRYPTO, &cfg); + return net->crypto_nlsk == NULL ? -ENOMEM : 0; +} - return 0; +static void __net_exit crypto_netlink_exit(struct net *net) +{ + netlink_kernel_release(net->crypto_nlsk); + net->crypto_nlsk = NULL; +} + +static struct pernet_operations crypto_netlink_net_ops = { + .init = crypto_netlink_init, + .exit = crypto_netlink_exit, +}; + +static int __init crypto_user_init(void) +{ + return register_pernet_subsys(&crypto_netlink_net_ops); } static void __exit crypto_user_exit(void) { - netlink_kernel_release(crypto_nlsk); + unregister_pernet_subsys(&crypto_netlink_net_ops); } module_init(crypto_user_init); diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index a03f326a63d3..8bad88413de1 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -298,6 +299,7 @@ out: int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs) { + struct net *net = sock_net(in_skb->sk); struct crypto_user_alg *p = nlmsg_data(in_nlh); struct crypto_alg *alg; struct sk_buff *skb; @@ -329,7 +331,7 @@ drop_alg: if (err) return err; - return nlmsg_unicast(crypto_nlsk, skb, NETLINK_CB(in_skb).portid); + return nlmsg_unicast(net->crypto_nlsk, skb, NETLINK_CB(in_skb).portid); } MODULE_LICENSE("GPL"); diff --git a/include/crypto/internal/cryptouser.h b/include/crypto/internal/cryptouser.h index 8c602b187c58..40623f4457df 100644 --- a/include/crypto/internal/cryptouser.h +++ b/include/crypto/internal/cryptouser.h @@ -1,8 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include -extern struct sock *crypto_nlsk; - struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact); #ifdef CONFIG_CRYPTO_STATS diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 4a9da951a794..85bc1de5dece 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -170,6 +170,9 @@ struct net { #endif #ifdef CONFIG_XDP_SOCKETS struct netns_xdp xdp; +#endif +#if IS_ENABLED(CONFIG_CRYPTO_USER) + struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; atomic_t fnhe_genid; -- cgit v1.2.3 From 1f961807925032daa90267d8a23ff730e7ede07a Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:44 -0700 Subject: mm/hmm: replace hmm_update with mmu_notifier_range The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes a struct hmm_update which is a simplified version of struct mmu_notifier_range. This is unnecessary so replace hmm_update with mmu_notifier_range directly. Link: https://lore.kernel.org/r/20190726005650.2566-2-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed: Christoph Hellwig Reviewed-by: Jason Gunthorpe [jgg: white space tuning] Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 14 ++++++++------ drivers/gpu/drm/nouveau/nouveau_svm.c | 4 ++-- include/linux/hmm.h | 34 ++++++---------------------------- mm/hmm.c | 13 ++++--------- 4 files changed, 20 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3971c201f320..b698b423b25d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -195,13 +195,14 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * Block for operations on BOs to finish and mark pages as accessed and * potentially dirty. */ -static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, - const struct hmm_update *update) +static int +amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); unsigned long start = update->start; unsigned long end = update->end; - bool blockable = update->blockable; + bool blockable = mmu_notifier_range_blockable(update); struct interval_tree_node *it; /* notification is exclusive, but interval is inclusive */ @@ -243,13 +244,14 @@ static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, * necessitates evicting all user-mode queues of the process. The BOs * are restorted in amdgpu_mn_invalidate_range_end_hsa. */ -static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, - const struct hmm_update *update) +static int +amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); unsigned long start = update->start; unsigned long end = update->end; - bool blockable = update->blockable; + bool blockable = mmu_notifier_range_blockable(update); struct interval_tree_node *it; /* notification is exclusive, but interval is inclusive */ diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 545100f7c594..79b29c918717 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) static int nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, - const struct hmm_update *update) + const struct mmu_notifier_range *update) { struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); unsigned long start = update->start; unsigned long limit = update->end; - if (!update->blockable) + if (!mmu_notifier_range_blockable(update)) return -EAGAIN; SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 9f32586684c9..4f870f7017cb 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -340,29 +340,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, struct hmm_mirror; -/* - * enum hmm_update_event - type of update - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) - */ -enum hmm_update_event { - HMM_UPDATE_INVALIDATE, -}; - -/* - * struct hmm_update - HMM update information for callback - * - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * @event: event triggering the update (what is happening) - * @blockable: can the callback block/sleep ? - */ -struct hmm_update { - unsigned long start; - unsigned long end; - enum hmm_update_event event; - bool blockable; -}; - /* * struct hmm_mirror_ops - HMM mirror device operations callback * @@ -383,9 +360,9 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update: update information (see struct hmm_update) - * Return: -EAGAIN if update.blockable false and callback need to - * block, 0 otherwise. + * @update: update information (see struct mmu_notifier_range) + * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false + * and callback needs to block, 0 otherwise. * * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -396,8 +373,9 @@ struct hmm_mirror_ops { * page tables are completely updated (TLBs flushed, etc); this is a * synchronous call. */ - int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, - const struct hmm_update *update); + int (*sync_cpu_device_pagetables)( + struct hmm_mirror *mirror, + const struct mmu_notifier_range *update); }; /* diff --git a/mm/hmm.c b/mm/hmm.c index 54b3a4162ae9..4040b4427635 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -165,7 +165,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - struct hmm_update update; struct hmm_range *range; unsigned long flags; int ret = 0; @@ -173,15 +172,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, if (!kref_get_unless_zero(&hmm->kref)) return 0; - update.start = nrange->start; - update.end = nrange->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = mmu_notifier_range_blockable(nrange); - spin_lock_irqsave(&hmm->ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, &hmm->ranges, list) { - if (update.end < range->start || update.start >= range->end) + if (nrange->end < range->start || nrange->start >= range->end) continue; range->valid = false; @@ -198,9 +192,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, list_for_each_entry(mirror, &hmm->mirrors, list) { int rc; - rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); if (rc) { - if (WARN_ON(update.blockable || rc != -EAGAIN)) + if (WARN_ON(mmu_notifier_range_blockable(nrange) || + rc != -EAGAIN)) continue; ret = -EAGAIN; break; -- cgit v1.2.3 From 9a4903e49e495bfd2650862dfae4178bebe4db9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jul 2019 17:56:46 -0700 Subject: mm/hmm: replace the block argument to hmm_range_fault with a flags value This allows easier expansion to other flags, and also makes the callers a little easier to read. Link: https://lore.kernel.org/r/20190726005650.2566-4-rcampbell@nvidia.com Signed-off-by: Christoph Hellwig Signed-off-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- include/linux/hmm.h | 11 ++++- mm/hmm.c | 74 ++++++++++++++++----------------- 4 files changed, 48 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index e51b48ac48eb..12a59ac83f72 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -832,7 +832,7 @@ retry: down_read(&mm->mmap_sem); - r = hmm_range_fault(range, true); + r = hmm_range_fault(range, 0); if (unlikely(r < 0)) { if (likely(r == -EAGAIN)) { /* diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 79b29c918717..49b520c60fc5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -505,7 +505,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) return -EBUSY; } - ret = hmm_range_fault(range, true); + ret = hmm_range_fault(range, 0); if (ret <= 0) { if (ret == 0) ret = -EBUSY; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4f870f7017cb..89a141060e5b 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -407,12 +407,19 @@ int hmm_range_register(struct hmm_range *range, unsigned long end, unsigned page_shift); void hmm_range_unregister(struct hmm_range *range); + +/* + * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. + */ +#define HMM_FAULT_ALLOW_RETRY (1 << 0) + long hmm_range_snapshot(struct hmm_range *range); -long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, unsigned int flags); + long hmm_range_dma_map(struct hmm_range *range, struct device *device, dma_addr_t *daddrs, - bool block); + unsigned int flags); long hmm_range_dma_unmap(struct hmm_range *range, struct vm_area_struct *vma, struct device *device, diff --git a/mm/hmm.c b/mm/hmm.c index 362944b0fbca..84f2791d3510 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -281,7 +281,7 @@ struct hmm_vma_walk { struct dev_pagemap *pgmap; unsigned long last; bool fault; - bool block; + unsigned int flags; }; static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, @@ -293,8 +293,11 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct vm_area_struct *vma = walk->vma; vm_fault_t ret; - flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= write_fault ? FAULT_FLAG_WRITE : 0; + if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) + flags |= FAULT_FLAG_ALLOW_RETRY; + if (write_fault) + flags |= FAULT_FLAG_WRITE; + ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) { /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ @@ -1012,26 +1015,26 @@ long hmm_range_snapshot(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_snapshot); -/* - * hmm_range_fault() - try to fault some address in a virtual address range - * @range: range being faulted - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Return: number of valid pages in range->pfns[] (from range start - * address). This may be zero. If the return value is negative, - * then one of the following values may be returned: +/** + * hmm_range_fault - try to fault some address in a virtual address range + * @range: range being faulted + * @flags: HMM_FAULT_* flags * - * -EINVAL invalid arguments or mm or virtual address are in an - * invalid vma (for instance device file vma). - * -ENOMEM: Out of memory. - * -EPERM: Invalid permission (for instance asking for write and - * range is read only). - * -EAGAIN: If you need to retry and mmap_sem was drop. This can only - * happens if block argument is false. - * -EBUSY: If the the range is being invalidated and you should wait - * for invalidation to finish. - * -EFAULT: Invalid (ie either no valid vma or it is illegal to access - * that range), number of valid pages in range->pfns[] (from - * range start address). + * Return: the number of valid pages in range->pfns[] (from range start + * address), which may be zero. On error one of the following status codes + * can be returned: + * + * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma + * (e.g., device file vma). + * -ENOMEM: Out of memory. + * -EPERM: Invalid permission (e.g., asking for write and range is read + * only). + * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped. + * -EBUSY: The range has been invalidated and the caller needs to wait for + * the invalidation to finish. + * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access + * that range) number of valid pages in range->pfns[] (from + * range start address). * * This is similar to a regular CPU page fault except that it will not trigger * any memory migration if the memory being faulted is not accessible by CPUs @@ -1040,7 +1043,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * On error, for one virtual address in the range, the function will mark the * corresponding HMM pfn entry with an error flag. */ -long hmm_range_fault(struct hmm_range *range, bool block) +long hmm_range_fault(struct hmm_range *range, unsigned int flags) { const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; unsigned long start = range->start, end; @@ -1086,7 +1089,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; hmm_vma_walk.fault = true; - hmm_vma_walk.block = block; + hmm_vma_walk.flags = flags; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; end = min(range->end, vma->vm_end); @@ -1125,25 +1128,22 @@ long hmm_range_fault(struct hmm_range *range, bool block) EXPORT_SYMBOL(hmm_range_fault); /** - * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. - * @range: range being faulted - * @device: device against to dma map page to - * @daddrs: dma address of mapped pages - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been - * drop and you need to try again, some other error value otherwise + * hmm_range_dma_map - hmm_range_fault() and dma map page all in one. + * @range: range being faulted + * @device: device to map page to + * @daddrs: array of dma addresses for the mapped pages + * @flags: HMM_FAULT_* * - * Note same usage pattern as hmm_range_fault(). + * Return: the number of pages mapped on success (including zero), or any + * status return from hmm_range_fault() otherwise. */ -long hmm_range_dma_map(struct hmm_range *range, - struct device *device, - dma_addr_t *daddrs, - bool block) +long hmm_range_dma_map(struct hmm_range *range, struct device *device, + dma_addr_t *daddrs, unsigned int flags) { unsigned long i, npages, mapped; long ret; - ret = hmm_range_fault(range, block); + ret = hmm_range_fault(range, flags); if (ret <= 0) return ret ? ret : -EBUSY; -- cgit v1.2.3 From d45d464b118f428229d91769c8a3cc1e2e0bb4d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jul 2019 17:56:47 -0700 Subject: mm/hmm: merge hmm_range_snapshot into hmm_range_fault Add a HMM_FAULT_SNAPSHOT flag so that hmm_range_snapshot can be merged into the almost identical hmm_range_fault function. Link: https://lore.kernel.org/r/20190726005650.2566-5-rcampbell@nvidia.com Signed-off-by: Christoph Hellwig Signed-off-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 17 +++++----- include/linux/hmm.h | 4 ++- mm/hmm.c | 85 ++---------------------------------------------- 3 files changed, 13 insertions(+), 93 deletions(-) (limited to 'include') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 710ce1c701bf..ddcb5ca8b296 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -192,15 +192,14 @@ read only, or fully unmap, etc.). The device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can -use either:: +use:: - long hmm_range_snapshot(struct hmm_range *range); - long hmm_range_fault(struct hmm_range *range, bool block); + long hmm_range_fault(struct hmm_range *range, unsigned int flags); -The first one (hmm_range_snapshot()) will only fetch present CPU page table +With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. -The second one does trigger a page fault on missing or read-only entries if -write access is requested (see below). Page faults use the generic mm page +Without that flag, it does trigger a page fault on missing or read-only entries +if write access is requested (see below). Page faults use the generic mm page fault code path just like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each @@ -227,20 +226,20 @@ The usage pattern is:: /* * Just wait for range to be valid, safe to ignore return value as we - * will use the return value of hmm_range_snapshot() below under the + * will use the return value of hmm_range_fault() below under the * mmap_sem to ascertain the validity of the range. */ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); again: down_read(&mm->mmap_sem); - ret = hmm_range_snapshot(&range); + ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT); if (ret) { up_read(&mm->mmap_sem); if (ret == -EBUSY) { /* * No need to check hmm_range_wait_until_valid() return value - * on retry we will get proper error with hmm_range_snapshot() + * on retry we will get proper error with hmm_range_fault() */ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); goto again; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 89a141060e5b..90dc5944b1bc 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -413,7 +413,9 @@ void hmm_range_unregister(struct hmm_range *range); */ #define HMM_FAULT_ALLOW_RETRY (1 << 0) -long hmm_range_snapshot(struct hmm_range *range); +/* Don't fault in missing PTEs, just snapshot the current state. */ +#define HMM_FAULT_SNAPSHOT (1 << 1) + long hmm_range_fault(struct hmm_range *range, unsigned int flags); long hmm_range_dma_map(struct hmm_range *range, diff --git a/mm/hmm.c b/mm/hmm.c index 84f2791d3510..1bc014cddd78 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -280,7 +280,6 @@ struct hmm_vma_walk { struct hmm_range *range; struct dev_pagemap *pgmap; unsigned long last; - bool fault; unsigned int flags; }; @@ -373,7 +372,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { struct hmm_range *range = hmm_vma_walk->range; - if (!hmm_vma_walk->fault) + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) return; /* @@ -418,7 +417,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { unsigned long i; - if (!hmm_vma_walk->fault) { + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { *fault = *write_fault = false; return; } @@ -936,85 +935,6 @@ void hmm_range_unregister(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_unregister); -/* - * hmm_range_snapshot() - snapshot CPU page table for a range - * @range: range - * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * permission (for instance asking for write and range is read only), - * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid - * vma or it is illegal to access that range), number of valid pages - * in range->pfns[] (from range start address). - * - * This snapshots the CPU page table for a range of virtual addresses. Snapshot - * validity is tracked by range struct. See in include/linux/hmm.h for example - * on how to use. - */ -long hmm_range_snapshot(struct hmm_range *range) -{ - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; - unsigned long start = range->start, end; - struct hmm_vma_walk hmm_vma_walk; - struct hmm *hmm = range->hmm; - struct vm_area_struct *vma; - struct mm_walk mm_walk; - - lockdep_assert_held(&hmm->mm->mmap_sem); - do { - /* If range is no longer valid force retry. */ - if (!range->valid) - return -EBUSY; - - vma = find_vma(hmm->mm, start); - if (vma == NULL || (vma->vm_flags & device_vma)) - return -EFAULT; - - if (is_vm_hugetlb_page(vma)) { - if (huge_page_shift(hstate_vma(vma)) != - range->page_shift && - range->page_shift != PAGE_SHIFT) - return -EINVAL; - } else { - if (range->page_shift != PAGE_SHIFT) - return -EINVAL; - } - - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it - * does not allow write access, either. HMM does not - * support architecture that allow write without read. - */ - hmm_pfns_clear(range, range->pfns, - range->start, range->end); - return -EPERM; - } - - range->vma = vma; - hmm_vma_walk.pgmap = NULL; - hmm_vma_walk.last = start; - hmm_vma_walk.fault = false; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - end = min(range->end, vma->vm_end); - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pud_entry = hmm_vma_walk_pud; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; - - walk_page_range(start, end, &mm_walk); - start = end; - } while (start < range->end); - - return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; -} -EXPORT_SYMBOL(hmm_range_snapshot); - /** * hmm_range_fault - try to fault some address in a virtual address range * @range: range being faulted @@ -1088,7 +1008,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) range->vma = vma; hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; - hmm_vma_walk.fault = true; hmm_vma_walk.flags = flags; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; -- cgit v1.2.3 From ef11a931bd1c57b02fe2603ff95a392a73041f9e Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 18 Jun 2019 08:19:14 +0200 Subject: mac80211: HE: add Spatial Reuse element parsing support Add support to mac80211 for parsing SPR elements as per P802.11ax_D4.0 section 9.4.2.241. Signed-off-by: Shashidhar Lakkavalli Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190618061915.7102-2-john@phrozen.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 1 + net/mac80211/util.c | 4 ++++ 3 files changed, 54 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f36144eda5d6..a656f31262f1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1633,6 +1633,18 @@ struct ieee80211_he_operation { u8 optional[0]; } __packed; +/** + * struct ieee80211_he_spr - HE spatial reuse element + * + * This structure is the "HE spatial reuse element" element as + * described in P802.11ax_D4.0 section 9.4.2.241 + */ +struct ieee80211_he_spr { + u8 he_sr_control; + /* Optional 0 to 19 bytes: depends on @he_sr_control */ + u8 optional[0]; +} __packed; + /** * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field * @@ -2071,6 +2083,42 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) return oper_len; } +/* HE Spatial Reuse defines */ +#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT 0x4 +#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT 0x8 + +/* + * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size + * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the the byte + * after the ext ID byte. It is assumed that he_spr_ie has at least + * sizeof(struct ieee80211_he_spr) bytes, the caller must have validated + * this + * @return the actual size of the IE data (not including header), or 0 on error + */ +static inline u8 +ieee80211_he_spr_size(const u8 *he_spr_ie) +{ + struct ieee80211_he_spr *he_spr = (void *)he_spr_ie; + u8 spr_len = sizeof(struct ieee80211_he_spr); + u32 he_spr_params; + + /* Make sure the input is not NULL */ + if (!he_spr_ie) + return 0; + + /* Calc required length */ + he_spr_params = le32_to_cpu(he_spr->he_sr_control); + if (he_spr_params & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) + spr_len++; + if (he_spr_params & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) + spr_len += 18; + + /* Add the first byte (extension ID) to the total length */ + spr_len++; + + return spr_len; +} + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -2493,6 +2541,7 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_HE_OPERATION = 36, WLAN_EID_EXT_UORA = 37, WLAN_EID_EXT_HE_MU_EDCA = 38, + WLAN_EID_EXT_HE_SPR = 39, WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME = 52, WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION = 55, WLAN_EID_EXT_NON_INHERITANCE = 56, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 4c80c0ed67a7..472c5a40317d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1480,6 +1480,7 @@ struct ieee802_11_elems { const struct ieee80211_meshconf_ie *mesh_config; const u8 *he_cap; const struct ieee80211_he_operation *he_operation; + const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; const u8 *uora_element; const u8 *mesh_id; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 3441558ef2d2..3e2aeb1a75b4 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1240,6 +1240,10 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION && elen == 3) { elems->mbssid_config_ie = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_HE_SPR && + elen >= sizeof(*elems->he_spr) && + elen >= ieee80211_he_spr_size(&pos[1])) { + elems->he_spr = (void *)&pos[1]; } break; default: -- cgit v1.2.3 From a0b4496a43681cbeec03a38e1b685c80c0d7405d Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 16 Jul 2019 00:09:19 +0200 Subject: mac80211: add IEEE80211_KEY_FLAG_GENERATE_MMIE to ieee80211_key_flags Add IEEE80211_KEY_FLAG_GENERATE_MMIE flag to ieee80211_key_flags in order to allow the driver to notify mac80211 to generate MMIE and that it requires sequence number generation only. This is a preliminary patch to add BIP_CMAC_128 hw support to mt7615 driver Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/dfe275f9aa0f1cc6b33085f9efd5d8447f68ad13.1563228405.git.lorenzo@kernel.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/wpa.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6fe4381ba0ef..9effd286c1ae 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1704,6 +1704,9 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); * a TKIP key if it only requires MIC space. Do not set together with * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation. + * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver + * for a AES_CMAC key to indicate that it requires sequence number + * generation only */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), @@ -1716,6 +1719,7 @@ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_RESERVE_TAILROOM = BIT(7), IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), IEEE80211_KEY_FLAG_NO_AUTO_TX = BIT(9), + IEEE80211_KEY_FLAG_GENERATE_MMIE = BIT(10), }; /** diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index ee72779729e5..91bf32af55e9 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -946,7 +946,8 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) info = IEEE80211_SKB_CB(skb); - if (info->control.hw_key) + if (info->control.hw_key && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) @@ -962,6 +963,9 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) bip_ipn_set64(mmie->sequence_number, pn64); + if (info->control.hw_key) + return TX_CONTINUE; + bip_aad(skb, aad); /* -- cgit v1.2.3 From 7a113110fc8cdda14023c0bffc7bd8b5f3da1edf Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 22 Jul 2019 06:33:10 -0500 Subject: nl80211: document uapi for CMD_FRAME_WAIT_CANCEL Commit 1c38c7f22068 ("nl80211: send event when CMD_FRAME duration expires") added the possibility of NL80211_CMD_FRAME_WAIT_CANCEL being sent whenever the off-channel wait time associated with a CMD_FRAME completes. Document this in the uapi/linux/nl80211.h file. Signed-off-by: Denis Kenzior Link: https://lore.kernel.org/r/20190722113312.14031-1-denkenz@gmail.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index beb9a9d0c00a..c45587c2cf44 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -657,7 +657,9 @@ * is used during CSA period. * @NL80211_CMD_FRAME_WAIT_CANCEL: When an off-channel TX was requested, this * command may be used with the corresponding cookie to cancel the wait - * time if it is known that it is no longer necessary. + * time if it is known that it is no longer necessary. This command is + * also sent as an event whenever the driver has completed the off-channel + * wait time. * @NL80211_CMD_ACTION: Alias for @NL80211_CMD_FRAME for backward compatibility. * @NL80211_CMD_FRAME_TX_STATUS: Report TX status of a management frame * transmitted with %NL80211_CMD_FRAME. %NL80211_ATTR_COOKIE identifies -- cgit v1.2.3 From cc374377a19d2a49d693997b62dc3a6f5fac6d61 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:50 -0700 Subject: mm/hmm: remove hmm_range vma Since hmm_range_fault() doesn't use the struct hmm_range vma field, remove it. Link: https://lore.kernel.org/r/20190726005650.2566-8-rcampbell@nvidia.com Suggested-by: Jason Gunthorpe Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/nouveau/nouveau_svm.c | 7 +++---- include/linux/hmm.h | 1 - mm/hmm.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 49b520c60fc5..a74530b5a523 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -496,12 +496,12 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) range->start, range->end, PAGE_SHIFT); if (ret) { - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); return (int)ret; } if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); return -EBUSY; } @@ -509,7 +509,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) if (ret <= 0) { if (ret == 0) ret = -EBUSY; - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); hmm_range_unregister(range); return ret; } @@ -682,7 +682,6 @@ nouveau_svm_fault(struct nvif_notify *notify) args.i.p.addr + args.i.p.size, fn - fi); /* Have HMM fault pages within the fault window to the GPU. */ - range.vma = vma; range.start = args.i.p.addr; range.end = args.i.p.addr + args.i.p.size; range.pfns = args.phys; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 90dc5944b1bc..82265118d94a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -164,7 +164,6 @@ enum hmm_pfn_value_e { */ struct hmm_range { struct hmm *hmm; - struct vm_area_struct *vma; struct list_head list; unsigned long start; unsigned long end; diff --git a/mm/hmm.c b/mm/hmm.c index 6111c0a3c12d..9a908902e4cc 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1002,7 +1002,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) return -EPERM; } - range->vma = vma; hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; hmm_vma_walk.flags = flags; -- cgit v1.2.3 From a7cf3d24ee6081930feb4c830a7f6f16ebe31c49 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 25 Jul 2019 12:07:12 -0600 Subject: net: qualcomm: rmnet: Fix incorrect UL checksum offload logic The udp_ip4_ind bit is set only for IPv4 UDP non-fragmented packets so that the hardware can flip the checksum to 0xFFFF if the computed checksum is 0 per RFC768. However, this bit had to be set for IPv6 UDP non fragmented packets as well per hardware requirements. Otherwise, IPv6 UDP packets with computed checksum as 0 were transmitted by hardware and were dropped in the network. In addition to setting this bit for IPv6 UDP, the field is also appropriately renamed to udp_ind as part of this change. Fixes: 5eb5f8608ef1 ("net: qualcomm: rmnet: Add support for TX checksum offload") Cc: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 13 +++++++++---- include/linux/if_rmnet.h | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index 60189923737a..21d38167f961 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -206,9 +206,9 @@ rmnet_map_ipv4_ul_csum_header(void *iphdr, ul_header->csum_insert_offset = skb->csum_offset; ul_header->csum_enabled = 1; if (ip4h->protocol == IPPROTO_UDP) - ul_header->udp_ip4_ind = 1; + ul_header->udp_ind = 1; else - ul_header->udp_ip4_ind = 0; + ul_header->udp_ind = 0; /* Changing remaining fields to network order */ hdr++; @@ -239,6 +239,7 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr, struct rmnet_map_ul_csum_header *ul_header, struct sk_buff *skb) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)ip6hdr; __be16 *hdr = (__be16 *)ul_header, offset; offset = htons((__force u16)(skb_transport_header(skb) - @@ -246,7 +247,11 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr, ul_header->csum_start_offset = offset; ul_header->csum_insert_offset = skb->csum_offset; ul_header->csum_enabled = 1; - ul_header->udp_ip4_ind = 0; + + if (ip6h->nexthdr == IPPROTO_UDP) + ul_header->udp_ind = 1; + else + ul_header->udp_ind = 0; /* Changing remaining fields to network order */ hdr++; @@ -419,7 +424,7 @@ sw_csum: ul_header->csum_start_offset = 0; ul_header->csum_insert_offset = 0; ul_header->csum_enabled = 0; - ul_header->udp_ip4_ind = 0; + ul_header->udp_ind = 0; priv->stats.csum_sw++; } diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h index b4f5403383fc..9661416a9bb4 100644 --- a/include/linux/if_rmnet.h +++ b/include/linux/if_rmnet.h @@ -41,11 +41,11 @@ struct rmnet_map_ul_csum_header { __be16 csum_start_offset; #if defined(__LITTLE_ENDIAN_BITFIELD) u16 csum_insert_offset:14; - u16 udp_ip4_ind:1; + u16 udp_ind:1; u16 csum_enabled:1; #elif defined (__BIG_ENDIAN_BITFIELD) u16 csum_enabled:1; - u16 udp_ip4_ind:1; + u16 udp_ind:1; u16 csum_insert_offset:14; #else #error "Please fix " -- cgit v1.2.3 From f1765a1819ff3489db9500c6d464e682e6844a14 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 26 Jul 2019 12:17:44 +0200 Subject: of: Fix typo in kerneldoc "Findfrom" is not a word. Replace the function synopsis by something that makes sense. Signed-off-by: Thierry Reding Signed-off-by: Rob Herring --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index 0cf857012f11..844f89e1b039 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1164,7 +1164,7 @@ static inline int of_property_read_string_index(const struct device_node *np, } /** - * of_property_read_bool - Findfrom a property + * of_property_read_bool - Find a property * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * -- cgit v1.2.3 From 934d24a5e1508e73c0001afb54a3916e4270428f Mon Sep 17 00:00:00 2001 From: Vitor Soares Date: Fri, 19 Jul 2019 15:30:54 +0200 Subject: i3c: move i3c_device_match_id to device.c and export it Some I3C device drivers need to know which entry matches the i3c_device object passed to the probe function Let's move i3c_device_match_id() to device.c and export it so it can be used by drivers. Signed-off-by: Vitor Soares Signed-off-by: Boris Brezillon --- drivers/i3c/device.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/i3c/master.c | 45 --------------------------------------- include/linux/i3c/device.h | 4 ++++ 3 files changed, 57 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/i3c/device.c b/drivers/i3c/device.c index 69cc040c3a1c..9e2e1406f85e 100644 --- a/drivers/i3c/device.c +++ b/drivers/i3c/device.c @@ -200,6 +200,59 @@ struct i3c_device *dev_to_i3cdev(struct device *dev) } EXPORT_SYMBOL_GPL(dev_to_i3cdev); +/** + * i3c_device_match_id() - Returns the i3c_device_id entry matching @i3cdev + * @i3cdev: I3C device + * @id_table: I3C device match table + * + * Return: a pointer to an i3c_device_id object or NULL if there's no match. + */ +const struct i3c_device_id * +i3c_device_match_id(struct i3c_device *i3cdev, + const struct i3c_device_id *id_table) +{ + struct i3c_device_info devinfo; + const struct i3c_device_id *id; + + i3c_device_get_info(i3cdev, &devinfo); + + /* + * The lower 32bits of the provisional ID is just filled with a random + * value, try to match using DCR info. + */ + if (!I3C_PID_RND_LOWER_32BITS(devinfo.pid)) { + u16 manuf = I3C_PID_MANUF_ID(devinfo.pid); + u16 part = I3C_PID_PART_ID(devinfo.pid); + u16 ext_info = I3C_PID_EXTRA_INFO(devinfo.pid); + + /* First try to match by manufacturer/part ID. */ + for (id = id_table; id->match_flags != 0; id++) { + if ((id->match_flags & I3C_MATCH_MANUF_AND_PART) != + I3C_MATCH_MANUF_AND_PART) + continue; + + if (manuf != id->manuf_id || part != id->part_id) + continue; + + if ((id->match_flags & I3C_MATCH_EXTRA_INFO) && + ext_info != id->extra_info) + continue; + + return id; + } + } + + /* Fallback to DCR match. */ + for (id = id_table; id->match_flags != 0; id++) { + if ((id->match_flags & I3C_MATCH_DCR) && + id->dcr == devinfo.dcr) + return id; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(i3c_device_match_id); + /** * i3c_driver_register_with_owner() - register an I3C device driver * diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index d6f8b038a896..c58729081899 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -276,51 +276,6 @@ static const struct device_type i3c_device_type = { .uevent = i3c_device_uevent, }; -static const struct i3c_device_id * -i3c_device_match_id(struct i3c_device *i3cdev, - const struct i3c_device_id *id_table) -{ - struct i3c_device_info devinfo; - const struct i3c_device_id *id; - - i3c_device_get_info(i3cdev, &devinfo); - - /* - * The lower 32bits of the provisional ID is just filled with a random - * value, try to match using DCR info. - */ - if (!I3C_PID_RND_LOWER_32BITS(devinfo.pid)) { - u16 manuf = I3C_PID_MANUF_ID(devinfo.pid); - u16 part = I3C_PID_PART_ID(devinfo.pid); - u16 ext_info = I3C_PID_EXTRA_INFO(devinfo.pid); - - /* First try to match by manufacturer/part ID. */ - for (id = id_table; id->match_flags != 0; id++) { - if ((id->match_flags & I3C_MATCH_MANUF_AND_PART) != - I3C_MATCH_MANUF_AND_PART) - continue; - - if (manuf != id->manuf_id || part != id->part_id) - continue; - - if ((id->match_flags & I3C_MATCH_EXTRA_INFO) && - ext_info != id->extra_info) - continue; - - return id; - } - } - - /* Fallback to DCR match. */ - for (id = id_table; id->match_flags != 0; id++) { - if ((id->match_flags & I3C_MATCH_DCR) && - id->dcr == devinfo.dcr) - return id; - } - - return NULL; -} - static int i3c_device_match(struct device *dev, struct device_driver *drv) { struct i3c_device *i3cdev; diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 5ecb055fd375..de102e4418ab 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -188,6 +188,10 @@ static inline struct i3c_driver *drv_to_i3cdrv(struct device_driver *drv) struct device *i3cdev_to_dev(struct i3c_device *i3cdev); struct i3c_device *dev_to_i3cdev(struct device *dev); +const struct i3c_device_id * +i3c_device_match_id(struct i3c_device *i3cdev, + const struct i3c_device_id *id_table); + static inline void i3cdev_set_drvdata(struct i3c_device *i3cdev, void *data) { -- cgit v1.2.3 From 6fc4dbcf0276279d488c5fbbfabe94734134f4fa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Jul 2019 23:01:46 +0800 Subject: padata: Replace delayed timer with immediate workqueue in padata_reorder The function padata_reorder will use a timer when it cannot progress while completed jobs are outstanding (pd->reorder_objects > 0). This is suboptimal as if we do end up using the timer then it would have introduced a gratuitous delay of one second. In fact we can easily distinguish between whether completed jobs are outstanding and whether we can make progress. All we have to do is look at the next pqueue list. This patch does that by replacing pd->processed with pd->cpu so that the next pqueue is more accessible. A work queue is used instead of the original try_again to avoid hogging the CPU. Note that we don't bother removing the work queue in padata_flush_queues because the whole premise is broken. You cannot flush async crypto requests so it makes no sense to even try. A subsequent patch will fix it by replacing it with a ref counting scheme. Signed-off-by: Herbert Xu --- include/linux/padata.h | 13 +++---- kernel/padata.c | 97 ++++++++++---------------------------------------- 2 files changed, 22 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/include/linux/padata.h b/include/linux/padata.h index 56f09e36f770..8da673861d99 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -73,18 +72,14 @@ struct padata_serial_queue { * @serial: List to wait for serialization after reordering. * @pwork: work struct for parallelization. * @swork: work struct for serialization. - * @pd: Backpointer to the internal control structure. * @work: work struct for parallelization. - * @reorder_work: work struct for reordering. * @num_obj: Number of objects that are processed by this cpu. * @cpu_index: Index of the cpu. */ struct padata_parallel_queue { struct padata_list parallel; struct padata_list reorder; - struct parallel_data *pd; struct work_struct work; - struct work_struct reorder_work; atomic_t num_obj; int cpu_index; }; @@ -110,10 +105,10 @@ struct padata_cpumask { * @reorder_objects: Number of objects waiting in the reorder queues. * @refcnt: Number of objects holding a reference on this parallel_data. * @max_seq_nr: Maximal used sequence number. + * @cpu: Next CPU to be processed. * @cpumask: The cpumasks in use for parallel and serial workers. + * @reorder_work: work struct for reordering. * @lock: Reorder lock. - * @processed: Number of already processed objects. - * @timer: Reorder timer. */ struct parallel_data { struct padata_instance *pinst; @@ -122,10 +117,10 @@ struct parallel_data { atomic_t reorder_objects; atomic_t refcnt; atomic_t seq_nr; + int cpu; struct padata_cpumask cpumask; + struct work_struct reorder_work; spinlock_t lock ____cacheline_aligned; - unsigned int processed; - struct timer_list timer; }; /** diff --git a/kernel/padata.c b/kernel/padata.c index 15a8ad63f4ff..fbafca18597f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -165,23 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel); */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus; - unsigned int next_nr, next_index; struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; + int cpu = pd->cpu; - num_cpus = cpumask_weight(pd->cpumask.pcpu); - - /* - * Calculate the percpu reorder queue and the sequence - * number of the next object. - */ - next_nr = pd->processed; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - reorder = &next_queue->reorder; spin_lock(&reorder->lock); @@ -192,7 +181,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); - pd->processed++; + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, + false); spin_unlock(&reorder->lock); goto out; @@ -215,6 +205,7 @@ static void padata_reorder(struct parallel_data *pd) struct padata_priv *padata; struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; + struct padata_parallel_queue *next_queue; /* * We need to ensure that only one cpu can work on dequeueing of @@ -246,7 +237,6 @@ static void padata_reorder(struct parallel_data *pd) * so exit immediately. */ if (PTR_ERR(padata) == -ENODATA) { - del_timer(&pd->timer); spin_unlock_bh(&pd->lock); return; } @@ -265,70 +255,29 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to - * the reorder queues in the meantime, we will be called again - * from the timer function if no one else cares for it. + * the reorder queues in the meantime. * - * Ensure reorder_objects is read after pd->lock is dropped so we see - * an increment from another task in padata_do_serial. Pairs with + * Ensure reorder queue is read after pd->lock is dropped so we see + * new objects from another task in padata_do_serial. Pairs with * smp_mb__after_atomic in padata_do_serial. */ smp_mb(); - if (atomic_read(&pd->reorder_objects) - && !(pinst->flags & PADATA_RESET)) - mod_timer(&pd->timer, jiffies + HZ); - else - del_timer(&pd->timer); - return; + next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); + if (!list_empty(&next_queue->reorder.list)) + queue_work(pinst->wq, &pd->reorder_work); } static void invoke_padata_reorder(struct work_struct *work) { - struct padata_parallel_queue *pqueue; struct parallel_data *pd; local_bh_disable(); - pqueue = container_of(work, struct padata_parallel_queue, reorder_work); - pd = pqueue->pd; + pd = container_of(work, struct parallel_data, reorder_work); padata_reorder(pd); local_bh_enable(); } -static void padata_reorder_timer(struct timer_list *t) -{ - struct parallel_data *pd = from_timer(pd, t, timer); - unsigned int weight; - int target_cpu, cpu; - - cpu = get_cpu(); - - /* We don't lock pd here to not interfere with parallel processing - * padata_reorder() calls on other CPUs. We just need any CPU out of - * the cpumask.pcpu set. It would be nice if it's the right one but - * it doesn't matter if we're off to the next one by using an outdated - * pd->processed value. - */ - weight = cpumask_weight(pd->cpumask.pcpu); - target_cpu = padata_index_to_cpu(pd, pd->processed % weight); - - /* ensure to call the reorder callback on the correct CPU */ - if (cpu != target_cpu) { - struct padata_parallel_queue *pqueue; - struct padata_instance *pinst; - - /* The timer function is serialized wrt itself -- no locking - * needed. - */ - pinst = pd->pinst; - pqueue = per_cpu_ptr(pd->pqueue, target_cpu); - queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work); - } else { - padata_reorder(pd); - } - - put_cpu(); -} - static void padata_serial_worker(struct work_struct *serial_work) { struct padata_serial_queue *squeue; @@ -376,9 +325,8 @@ void padata_do_serial(struct padata_priv *padata) cpu = get_cpu(); - /* We need to run on the same CPU padata_do_parallel(.., padata, ..) - * was called on -- or, at least, enqueue the padata object into the - * correct per-cpu queue. + /* We need to enqueue the padata object into the correct + * per-cpu queue. */ if (cpu != padata->cpu) { reorder_via_wq = 1; @@ -388,12 +336,12 @@ void padata_do_serial(struct padata_priv *padata) pqueue = per_cpu_ptr(pd->pqueue, cpu); spin_lock(&pqueue->reorder.lock); - atomic_inc(&pd->reorder_objects); list_add_tail(&padata->list, &pqueue->reorder.list); + atomic_inc(&pd->reorder_objects); spin_unlock(&pqueue->reorder.lock); /* - * Ensure the atomic_inc of reorder_objects above is ordered correctly + * Ensure the addition to the reorder list is ordered correctly * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb * in padata_reorder. */ @@ -401,13 +349,7 @@ void padata_do_serial(struct padata_priv *padata) put_cpu(); - /* If we're running on the wrong CPU, call padata_reorder() via a - * kernel worker. - */ - if (reorder_via_wq) - queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work); - else - padata_reorder(pd); + padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); @@ -463,14 +405,12 @@ static void padata_init_pqueues(struct parallel_data *pd) continue; } - pqueue->pd = pd; pqueue->cpu_index = cpu_index; cpu_index++; __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); - INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder); atomic_set(&pqueue->num_obj, 0); } } @@ -498,12 +438,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); - timer_setup(&pd->timer, padata_reorder_timer, 0); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; spin_lock_init(&pd->lock); + pd->cpu = cpumask_first(pcpumask); + INIT_WORK(&pd->reorder_work, invoke_padata_reorder); return pd; @@ -538,8 +479,6 @@ static void padata_flush_queues(struct parallel_data *pd) flush_work(&pqueue->work); } - del_timer_sync(&pd->timer); - if (atomic_read(&pd->reorder_objects)) padata_reorder(pd); -- cgit v1.2.3 From 8dfa20fcfbeb245642dfe3a43f8a3735d9aed42a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Jul 2019 23:09:18 -0700 Subject: crypto: ghash - add comment and improve help text To help avoid confusion, add a comment to ghash-generic.c which explains the convention that the kernel's implementation of GHASH uses. Also update the Kconfig help text and module descriptions to call GHASH a "hash function" rather than a "message digest", since the latter normally means a real cryptographic hash function, which GHASH is not. Cc: Pascal Van Leeuwen Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Acked-by: Pascal Van Leeuwen Signed-off-by: Herbert Xu --- arch/arm/crypto/ghash-ce-glue.c | 2 +- arch/s390/crypto/ghash_s390.c | 2 +- arch/x86/crypto/ghash-clmulni-intel_glue.c | 3 +-- crypto/Kconfig | 11 ++++++----- crypto/ghash-generic.c | 31 +++++++++++++++++++++++++++--- drivers/crypto/Kconfig | 6 +++--- include/crypto/ghash.h | 2 +- 7 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index bb906b5f1eb3..c691077679a6 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -18,7 +18,7 @@ #include #include -MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); +MODULE_DESCRIPTION("GHASH hash function using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index eeeb6a7737a4..a3e7400e031c 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -153,4 +153,4 @@ module_exit(ghash_mod_exit); MODULE_ALIAS_CRYPTO("ghash"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("GHASH Message Digest Algorithm, s390 implementation"); +MODULE_DESCRIPTION("GHASH hash function, s390 implementation"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index ac76fe88ac4f..04d72a5a8ce9 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -357,6 +357,5 @@ module_init(ghash_pclmulqdqni_mod_init); module_exit(ghash_pclmulqdqni_mod_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " - "accelerated by PCLMULQDQ-NI"); +MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI"); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 2e7f08ba0675..455a3354e291 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -647,11 +647,12 @@ config CRYPTO_VPMSUM_TESTER Unless you are testing these algorithms, you don't need this. config CRYPTO_GHASH - tristate "GHASH digest algorithm" + tristate "GHASH hash function" select CRYPTO_GF128MUL select CRYPTO_HASH help - GHASH is message digest algorithm for GCM (Galois/Counter Mode). + GHASH is the hash function used in GCM (Galois/Counter Mode). + It is not a general-purpose cryptographic hash function. config CRYPTO_POLY1305 tristate "Poly1305 authenticator algorithm" @@ -976,12 +977,12 @@ config CRYPTO_WP512 config CRYPTO_GHASH_CLMUL_NI_INTEL - tristate "GHASH digest algorithm (CLMUL-NI accelerated)" + tristate "GHASH hash function (CLMUL-NI accelerated)" depends on X86 && 64BIT select CRYPTO_CRYPTD help - GHASH is message digest algorithm for GCM (Galois/Counter Mode). - The implementation is accelerated by CLMUL-NI of Intel. + This is the x86_64 CLMUL-NI accelerated implementation of + GHASH, the hash function used in GCM (Galois/Counter mode). comment "Ciphers" diff --git a/crypto/ghash-generic.c b/crypto/ghash-generic.c index dad9e1f91a78..5027b3461c92 100644 --- a/crypto/ghash-generic.c +++ b/crypto/ghash-generic.c @@ -1,12 +1,37 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * GHASH: digest algorithm for GCM (Galois/Counter Mode). + * GHASH: hash function for GCM (Galois/Counter Mode). * * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen * Copyright (c) 2009 Intel Corp. * Author: Huang Ying + */ + +/* + * GHASH is a keyed hash function used in GCM authentication tag generation. + * + * The original GCM paper [1] presents GHASH as a function GHASH(H, A, C) which + * takes a 16-byte hash key H, additional authenticated data A, and a ciphertext + * C. It formats A and C into a single byte string X, interprets X as a + * polynomial over GF(2^128), and evaluates this polynomial at the point H. + * + * However, the NIST standard for GCM [2] presents GHASH as GHASH(H, X) where X + * is the already-formatted byte string containing both A and C. + * + * "ghash" in the Linux crypto API uses the 'X' (pre-formatted) convention, + * since the API supports only a single data stream per hash. Thus, the + * formatting of 'A' and 'C' is done in the "gcm" template, not in "ghash". + * + * The reason "ghash" is separate from "gcm" is to allow "gcm" to use an + * accelerated "ghash" when a standalone accelerated "gcm(aes)" is unavailable. + * It is generally inappropriate to use "ghash" for other purposes, since it is + * an "ε-almost-XOR-universal hash function", not a cryptographic hash function. + * It can only be used securely in crypto modes specially designed to use it. * - * The algorithm implementation is copied from gcm.c. + * [1] The Galois/Counter Mode of Operation (GCM) + * (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.694.695&rep=rep1&type=pdf) + * [2] Recommendation for Block Cipher Modes of Operation: Galois/Counter Mode (GCM) and GMAC + * (https://csrc.nist.gov/publications/detail/sp/800-38d/final) */ #include @@ -156,6 +181,6 @@ subsys_initcall(ghash_mod_init); module_exit(ghash_mod_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("GHASH Message Digest Algorithm"); +MODULE_DESCRIPTION("GHASH hash function"); MODULE_ALIAS_CRYPTO("ghash"); MODULE_ALIAS_CRYPTO("ghash-generic"); diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 69d1bbd5d9bf..b8c50871f11b 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -189,12 +189,12 @@ config S390_PRNG It is available as of z9. config CRYPTO_GHASH_S390 - tristate "GHASH digest algorithm" + tristate "GHASH hash function" depends on S390 select CRYPTO_HASH help - This is the s390 hardware accelerated implementation of the - GHASH message digest algorithm for GCM (Galois/Counter Mode). + This is the s390 hardware accelerated implementation of GHASH, + the hash function used in GCM (Galois/Counter mode). It is available as of z196. diff --git a/include/crypto/ghash.h b/include/crypto/ghash.h index 9136301062a5..f832c9f2aca3 100644 --- a/include/crypto/ghash.h +++ b/include/crypto/ghash.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Common values for GHASH algorithms + * Common values for the GHASH hash function */ #ifndef __CRYPTO_GHASH_H__ -- cgit v1.2.3 From ed1f2e85da79274f3dc4092953f1359eb732f0c6 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Thu, 18 Jul 2019 16:28:24 -0700 Subject: iio: cros_ec: Add calibscale for 3d MEMS Add calibration scale support to accel, gyro and magnetometer. Check on eve with current firmware, check reading calibscale returns 1.0, check with newer firmware values are applied. Signed-off-by: Gwendal Grignou Signed-off-by: Jonathan Cameron --- .../iio/common/cros_ec_sensors/cros_ec_sensors.c | 51 +++++++++++++++++++--- .../common/cros_ec_sensors/cros_ec_sensors_core.c | 2 +- drivers/iio/light/cros_ec_light_prox.c | 12 ++--- drivers/iio/pressure/cros_ec_baro.c | 2 - include/linux/iio/common/cros_ec_sensors_core.h | 5 ++- 5 files changed, 57 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c index 17af4e0fd5f8..2af09606c438 100644 --- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c +++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c @@ -63,10 +63,35 @@ static int cros_ec_sensors_read(struct iio_dev *indio_dev, /* Save values */ for (i = CROS_EC_SENSOR_X; i < CROS_EC_SENSOR_MAX_AXIS; i++) - st->core.calib[i] = + st->core.calib[i].offset = st->core.resp->sensor_offset.offset[i]; ret = IIO_VAL_INT; - *val = st->core.calib[idx]; + *val = st->core.calib[idx].offset; + break; + case IIO_CHAN_INFO_CALIBSCALE: + st->core.param.cmd = MOTIONSENSE_CMD_SENSOR_SCALE; + st->core.param.sensor_offset.flags = 0; + + ret = cros_ec_motion_send_host_cmd(&st->core, 0); + if (ret == -EPROTO) { + /* Reading calibscale is not supported on older EC. */ + *val = 1; + *val2 = 0; + ret = IIO_VAL_INT_PLUS_MICRO; + break; + } else if (ret) { + break; + } + + /* Save values */ + for (i = CROS_EC_SENSOR_X; i < CROS_EC_SENSOR_MAX_AXIS; i++) + st->core.calib[i].scale = + st->core.resp->sensor_scale.scale[i]; + + *val = st->core.calib[idx].scale >> 15; + *val2 = ((st->core.calib[idx].scale & 0x7FFF) * 1000000LL) / + MOTION_SENSE_DEFAULT_SCALE; + ret = IIO_VAL_INT_PLUS_MICRO; break; case IIO_CHAN_INFO_SCALE: st->core.param.cmd = MOTIONSENSE_CMD_SENSOR_RANGE; @@ -134,7 +159,7 @@ static int cros_ec_sensors_write(struct iio_dev *indio_dev, switch (mask) { case IIO_CHAN_INFO_CALIBBIAS: - st->core.calib[idx] = val; + st->core.calib[idx].offset = val; /* Send to EC for each axis, even if not complete */ st->core.param.cmd = MOTIONSENSE_CMD_SENSOR_OFFSET; @@ -142,10 +167,25 @@ static int cros_ec_sensors_write(struct iio_dev *indio_dev, MOTION_SENSE_SET_OFFSET; for (i = CROS_EC_SENSOR_X; i < CROS_EC_SENSOR_MAX_AXIS; i++) st->core.param.sensor_offset.offset[i] = - st->core.calib[i]; + st->core.calib[i].offset; st->core.param.sensor_offset.temp = EC_MOTION_SENSE_INVALID_CALIB_TEMP; + ret = cros_ec_motion_send_host_cmd(&st->core, 0); + break; + case IIO_CHAN_INFO_CALIBSCALE: + st->core.calib[idx].scale = val; + /* Send to EC for each axis, even if not complete */ + + st->core.param.cmd = MOTIONSENSE_CMD_SENSOR_SCALE; + st->core.param.sensor_offset.flags = + MOTION_SENSE_SET_OFFSET; + for (i = CROS_EC_SENSOR_X; i < CROS_EC_SENSOR_MAX_AXIS; i++) + st->core.param.sensor_scale.scale[i] = + st->core.calib[i].scale; + st->core.param.sensor_scale.temp = + EC_MOTION_SENSE_INVALID_CALIB_TEMP; + ret = cros_ec_motion_send_host_cmd(&st->core, 0); break; case IIO_CHAN_INFO_SCALE: @@ -206,7 +246,8 @@ static int cros_ec_sensors_probe(struct platform_device *pdev) /* Common part */ channel->info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | - BIT(IIO_CHAN_INFO_CALIBBIAS); + BIT(IIO_CHAN_INFO_CALIBBIAS) | + BIT(IIO_CHAN_INFO_CALIBSCALE); channel->info_mask_shared_by_all = BIT(IIO_CHAN_INFO_SCALE) | BIT(IIO_CHAN_INFO_FREQUENCY) | diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c index 130362ca421b..96d5aa1f4bd5 100644 --- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c +++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c @@ -118,7 +118,7 @@ static ssize_t cros_ec_sensors_calibrate(struct iio_dev *indio_dev, } else { /* Save values */ for (i = CROS_EC_SENSOR_X; i < CROS_EC_SENSOR_MAX_AXIS; i++) - st->calib[i] = st->resp->perform_calib.offset[i]; + st->calib[i].offset = st->resp->perform_calib.offset[i]; } mutex_unlock(&st->cmd_lock); diff --git a/drivers/iio/light/cros_ec_light_prox.c b/drivers/iio/light/cros_ec_light_prox.c index 308ee6ff2e22..b81746a99f1f 100644 --- a/drivers/iio/light/cros_ec_light_prox.c +++ b/drivers/iio/light/cros_ec_light_prox.c @@ -88,9 +88,10 @@ static int cros_ec_light_prox_read(struct iio_dev *indio_dev, } /* Save values */ - st->core.calib[0] = st->core.resp->sensor_offset.offset[0]; + st->core.calib[0].offset = + st->core.resp->sensor_offset.offset[0]; - *val = st->core.calib[idx]; + *val = st->core.calib[idx].offset; break; case IIO_CHAN_INFO_CALIBSCALE: /* @@ -134,11 +135,12 @@ static int cros_ec_light_prox_write(struct iio_dev *indio_dev, switch (mask) { case IIO_CHAN_INFO_CALIBBIAS: - st->core.calib[idx] = val; + st->core.calib[idx].offset = val; /* Send to EC for each axis, even if not complete */ st->core.param.cmd = MOTIONSENSE_CMD_SENSOR_OFFSET; st->core.param.sensor_offset.flags = MOTION_SENSE_SET_OFFSET; - st->core.param.sensor_offset.offset[0] = st->core.calib[0]; + st->core.param.sensor_offset.offset[0] = + st->core.calib[0].offset; st->core.param.sensor_offset.temp = EC_MOTION_SENSE_INVALID_CALIB_TEMP; if (cros_ec_motion_send_host_cmd(&st->core, 0)) @@ -205,8 +207,6 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev) channel->ext_info = cros_ec_sensors_ext_info; channel->scan_type.sign = 'u'; - state->core.calib[0] = 0; - /* Sensor specific */ switch (state->core.type) { case MOTIONSENSE_TYPE_LIGHT: diff --git a/drivers/iio/pressure/cros_ec_baro.c b/drivers/iio/pressure/cros_ec_baro.c index 034ce98d6e97..d3acba7ba582 100644 --- a/drivers/iio/pressure/cros_ec_baro.c +++ b/drivers/iio/pressure/cros_ec_baro.c @@ -152,8 +152,6 @@ static int cros_ec_baro_probe(struct platform_device *pdev) channel->ext_info = cros_ec_sensors_ext_info; channel->scan_type.sign = 'u'; - state->core.calib[0] = 0; - /* Sensor specific */ switch (state->core.type) { case MOTIONSENSE_TYPE_BARO: diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h index 0c636b9fe8d7..bb03a252bd04 100644 --- a/include/linux/iio/common/cros_ec_sensors_core.h +++ b/include/linux/iio/common/cros_ec_sensors_core.h @@ -62,7 +62,10 @@ struct cros_ec_sensors_core_state { enum motionsensor_type type; enum motionsensor_location loc; - s16 calib[CROS_EC_SENSOR_MAX_AXIS]; + struct calib_data { + s16 offset; + u16 scale; + } calib[CROS_EC_SENSOR_MAX_AXIS]; u8 samples[CROS_EC_SAMPLE_SIZE]; -- cgit v1.2.3 From 1a981c0586c038710227eb740350f291e77ce365 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 26 Jul 2019 12:27:40 +0200 Subject: net: stmmac: Make MDIO bus reset optional The Tegra EQOS driver already resets the MDIO bus at probe time via the reset GPIO specified in the phy-reset-gpios device tree property. There is no need to reset the bus again later on. This avoids the need to query the device tree for the snps,reset GPIO, which is not part of the Tegra EQOS device tree bindings. This quiesces an error message from the generic bus reset code if it doesn't find the snps,reset related delays. Signed-off-by: Thierry Reding Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 3 +++ drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 4 +++- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 8 +++++++- include/linux/stmmac.h | 1 + 5 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 3a14cdd01f5f..66933332c68e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -333,6 +333,9 @@ static void *tegra_eqos_probe(struct platform_device *pdev, usleep_range(2000, 4000); gpiod_set_value(eqos->reset, 0); + /* MDIO bus was already reset just above */ + data->mdio_bus_data->needs_reset = false; + eqos->rst = devm_reset_control_get(&pdev->dev, "eqos"); if (IS_ERR(eqos->rst)) { err = PTR_ERR(eqos->rst); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index 4304c1abc5d1..40c42637ad75 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -348,7 +348,9 @@ int stmmac_mdio_register(struct net_device *ndev) max_addr = PHY_MAX_ADDR; } - new_bus->reset = &stmmac_mdio_reset; + if (mdio_bus_data->needs_reset) + new_bus->reset = &stmmac_mdio_reset; + snprintf(new_bus->id, MII_BUS_ID_SIZE, "%s-%x", new_bus->name, priv->plat->bus_id); new_bus->priv = ndev; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 86f9c07a38cf..d5d08e11c353 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -63,6 +63,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) plat->has_gmac = 1; plat->force_sf_dma_mode = 1; + plat->mdio_bus_data->needs_reset = true; plat->mdio_bus_data->phy_mask = 0; /* Set default value for multicast hash bins */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 73fc2524372e..333b09564b88 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -342,10 +342,16 @@ static int stmmac_dt_phy(struct plat_stmmacenet_data *plat, mdio = true; } - if (mdio) + if (mdio) { plat->mdio_bus_data = devm_kzalloc(dev, sizeof(struct stmmac_mdio_bus_data), GFP_KERNEL); + if (!plat->mdio_bus_data) + return -ENOMEM; + + plat->mdio_bus_data->needs_reset = true; + } + return 0; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 7d06241582dd..7b3e354bcd3c 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -81,6 +81,7 @@ struct stmmac_mdio_bus_data { unsigned int phy_mask; int *irqs; int probed_phy_irq; + bool needs_reset; }; struct stmmac_dma_cfg { -- cgit v1.2.3 From a090965b882333500d8780e2c1762e17782f413f Mon Sep 17 00:00:00 2001 From: Denis Ciocca Date: Thu, 18 Jul 2019 15:53:43 -0700 Subject: iio:common:st_sensors: add st_sensors_get_settings_index() helper function Extract from st_sensors_check_device_support() function the code that is used to get the specific settings for a device. This will be used as generic extractor by each ST driver. Signed-off-by: Denis Ciocca Signed-off-by: Jonathan Cameron --- drivers/iio/common/st_sensors/st_sensors_core.c | 49 +++++++++++++++++-------- include/linux/iio/common/st_sensors.h | 4 ++ 2 files changed, 38 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index 8b22dc241482..3610ca9eaa87 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -633,28 +633,47 @@ static int st_sensors_init_interface_mode(struct iio_dev *indio_dev, return 0; } +/* + * st_sensors_get_settings_index() - get index of the sensor settings for a + * specific device from list of settings + * @name: device name buffer reference. + * @list: sensor settings list. + * @list_length: length of sensor settings list. + * + * Return: non negative number on success (valid index), + * negative error code otherwise. + */ +int st_sensors_get_settings_index(const char *name, + const struct st_sensor_settings *list, + const int list_length) +{ + int i, n; + + for (i = 0; i < list_length; i++) { + for (n = 0; n < ST_SENSORS_MAX_4WAI; n++) { + if (strcmp(name, list[i].sensors_supported[n]) == 0) + return i; + } + } + + return -ENODEV; +} +EXPORT_SYMBOL(st_sensors_get_settings_index); + int st_sensors_check_device_support(struct iio_dev *indio_dev, int num_sensors_list, const struct st_sensor_settings *sensor_settings) { - int i, n, err = 0; - u8 wai; struct st_sensor_data *sdata = iio_priv(indio_dev); + int i, err; + u8 wai; - for (i = 0; i < num_sensors_list; i++) { - for (n = 0; n < ST_SENSORS_MAX_4WAI; n++) { - if (strcmp(indio_dev->name, - sensor_settings[i].sensors_supported[n]) == 0) { - break; - } - } - if (n < ST_SENSORS_MAX_4WAI) - break; - } - if (i == num_sensors_list) { + i = st_sensors_get_settings_index(indio_dev->name, + sensor_settings, num_sensors_list); + if (i < 0) { dev_err(&indio_dev->dev, "device name %s not recognized.\n", - indio_dev->name); - return -ENODEV; + indio_dev->name); + return i; } err = st_sensors_init_interface_mode(indio_dev, &sensor_settings[i]); diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 2948ac99e2da..17fbf3e9b013 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -334,6 +334,10 @@ int st_sensors_set_fullscale_by_gain(struct iio_dev *indio_dev, int scale); int st_sensors_read_info_raw(struct iio_dev *indio_dev, struct iio_chan_spec const *ch, int *val); +int st_sensors_get_settings_index(const char *name, + const struct st_sensor_settings *list, + const int list_length); + int st_sensors_check_device_support(struct iio_dev *indio_dev, int num_sensors_list, const struct st_sensor_settings *sensor_settings); -- cgit v1.2.3 From 1ecd245e0eb23d1c3803474eba75589743d0d1fe Mon Sep 17 00:00:00 2001 From: Denis Ciocca Date: Thu, 18 Jul 2019 15:53:52 -0700 Subject: iio: move 3-wire spi initialization to st_sensors_spi Some devices need to be configured with special bit in order to use spi 3-wire. This was done during device identification phase. Instead, let's move this part as spi specific. Doing this the check_device_support function becomes a simple device id check, so let's rename it. Signed-off-by: Denis Ciocca Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_core.c | 4 +- drivers/iio/accel/st_accel_spi.c | 4 +- drivers/iio/common/st_sensors/st_sensors_core.c | 64 ++++++------------------ drivers/iio/common/st_sensors/st_sensors_spi.c | 65 ++++++++++++++++++++++++- drivers/iio/gyro/st_gyro_core.c | 4 +- drivers/iio/gyro/st_gyro_spi.c | 4 +- drivers/iio/magnetometer/st_magn_core.c | 4 +- drivers/iio/magnetometer/st_magn_spi.c | 4 +- drivers/iio/pressure/st_pressure_core.c | 4 +- drivers/iio/pressure/st_pressure_spi.c | 4 +- include/linux/iio/common/st_sensors.h | 3 +- include/linux/iio/common/st_sensors_spi.h | 4 +- 12 files changed, 97 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index 6fc490ffef7e..630909702a19 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -1183,9 +1183,7 @@ int st_accel_common_probe(struct iio_dev *indio_dev) if (err) return err; - err = st_sensors_check_device_support(indio_dev, - ARRAY_SIZE(st_accel_sensors_settings), - st_accel_sensors_settings); + err = st_sensors_verify_id(indio_dev); if (err < 0) goto st_accel_power_off; diff --git a/drivers/iio/accel/st_accel_spi.c b/drivers/iio/accel/st_accel_spi.c index c0556db9d60a..8af7027d5598 100644 --- a/drivers/iio/accel/st_accel_spi.c +++ b/drivers/iio/accel/st_accel_spi.c @@ -124,7 +124,9 @@ static int st_accel_spi_probe(struct spi_device *spi) adata = iio_priv(indio_dev); adata->sensor_settings = (struct st_sensor_settings *)settings; - st_sensors_spi_configure(indio_dev, spi, adata); + err = st_sensors_spi_configure(indio_dev, spi); + if (err < 0) + return err; err = st_accel_common_probe(indio_dev); if (err < 0) diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index 3610ca9eaa87..f05bf7cf0594 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -608,31 +608,6 @@ out: } EXPORT_SYMBOL(st_sensors_read_info_raw); -static int st_sensors_init_interface_mode(struct iio_dev *indio_dev, - const struct st_sensor_settings *sensor_settings) -{ - struct st_sensor_data *sdata = iio_priv(indio_dev); - struct device_node *np = sdata->dev->of_node; - struct st_sensors_platform_data *pdata; - - pdata = (struct st_sensors_platform_data *)sdata->dev->platform_data; - if (((np && of_property_read_bool(np, "spi-3wire")) || - (pdata && pdata->spi_3wire)) && sensor_settings->sim.addr) { - int err; - - err = sdata->tf->write_byte(&sdata->tb, sdata->dev, - sensor_settings->sim.addr, - sensor_settings->sim.value); - if (err < 0) { - dev_err(&indio_dev->dev, - "failed to init interface mode\n"); - return err; - } - } - - return 0; -} - /* * st_sensors_get_settings_index() - get index of the sensor settings for a * specific device from list of settings @@ -660,36 +635,30 @@ int st_sensors_get_settings_index(const char *name, } EXPORT_SYMBOL(st_sensors_get_settings_index); -int st_sensors_check_device_support(struct iio_dev *indio_dev, - int num_sensors_list, - const struct st_sensor_settings *sensor_settings) +/* + * st_sensors_verify_id() - verify sensor ID (WhoAmI) is matching with the + * expected value + * @indio_dev: IIO device reference. + * + * Return: 0 on success (valid sensor ID), else a negative error code. + */ +int st_sensors_verify_id(struct iio_dev *indio_dev) { struct st_sensor_data *sdata = iio_priv(indio_dev); - int i, err; + int err; u8 wai; - i = st_sensors_get_settings_index(indio_dev->name, - sensor_settings, num_sensors_list); - if (i < 0) { - dev_err(&indio_dev->dev, "device name %s not recognized.\n", - indio_dev->name); - return i; - } - - err = st_sensors_init_interface_mode(indio_dev, &sensor_settings[i]); - if (err < 0) - return err; - - if (sensor_settings[i].wai_addr) { + if (sdata->sensor_settings->wai_addr) { err = sdata->tf->read_byte(&sdata->tb, sdata->dev, - sensor_settings[i].wai_addr, &wai); + sdata->sensor_settings->wai_addr, + &wai); if (err < 0) { dev_err(&indio_dev->dev, "failed to read Who-Am-I register.\n"); return err; } - if (sensor_settings[i].wai != wai) { + if (sdata->sensor_settings->wai != wai) { dev_err(&indio_dev->dev, "%s: WhoAmI mismatch (0x%x).\n", indio_dev->name, wai); @@ -697,12 +666,9 @@ int st_sensors_check_device_support(struct iio_dev *indio_dev, } } - sdata->sensor_settings = - (struct st_sensor_settings *)&sensor_settings[i]; - - return i; + return 0; } -EXPORT_SYMBOL(st_sensors_check_device_support); +EXPORT_SYMBOL(st_sensors_verify_id); ssize_t st_sensors_sysfs_sampling_frequency_avail(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/drivers/iio/common/st_sensors/st_sensors_spi.c b/drivers/iio/common/st_sensors/st_sensors_spi.c index 2213843f02cb..a57cd648975c 100644 --- a/drivers/iio/common/st_sensors/st_sensors_spi.c +++ b/drivers/iio/common/st_sensors/st_sensors_spi.c @@ -102,9 +102,68 @@ static const struct st_sensor_transfer_function st_sensors_tf_spi = { .read_multiple_byte = st_sensors_spi_read_multiple_byte, }; -void st_sensors_spi_configure(struct iio_dev *indio_dev, - struct spi_device *spi, struct st_sensor_data *sdata) +/* + * st_sensors_is_spi_3_wire() - check if SPI 3-wire mode has been selected + * @spi: spi device reference. + * + * Return: true if SPI 3-wire mode is selected, false otherwise. + */ +static bool st_sensors_is_spi_3_wire(struct spi_device *spi) +{ + struct device_node *np = spi->dev.of_node; + struct st_sensors_platform_data *pdata; + + pdata = (struct st_sensors_platform_data *)spi->dev.platform_data; + if ((np && of_property_read_bool(np, "spi-3wire")) || + (pdata && pdata->spi_3wire)) { + return true; + } + + return false; +} + +/* + * st_sensors_configure_spi_3_wire() - configure SPI 3-wire if needed + * @spi: spi device reference. + * @settings: sensor specific settings reference. + * + * Return: 0 on success, else a negative error code. + */ +static int st_sensors_configure_spi_3_wire(struct spi_device *spi, + struct st_sensor_settings *settings) +{ + if (settings->sim.addr) { + u8 buffer[] = { + settings->sim.addr, + settings->sim.value + }; + + return spi_write(spi, buffer, 2); + } + + return 0; +} + +/* + * st_sensors_spi_configure() - configure SPI interface + * @indio_dev: IIO device reference. + * @spi: spi device reference. + * + * Return: 0 on success, else a negative error code. + */ +int st_sensors_spi_configure(struct iio_dev *indio_dev, + struct spi_device *spi) { + struct st_sensor_data *sdata = iio_priv(indio_dev); + int err; + + if (st_sensors_is_spi_3_wire(spi)) { + err = st_sensors_configure_spi_3_wire(spi, + sdata->sensor_settings); + if (err < 0) + return err; + } + spi_set_drvdata(spi, indio_dev); indio_dev->dev.parent = &spi->dev; @@ -113,6 +172,8 @@ void st_sensors_spi_configure(struct iio_dev *indio_dev, sdata->dev = &spi->dev; sdata->tf = &st_sensors_tf_spi; sdata->get_irq_data_ready = st_sensors_spi_get_irq; + + return 0; } EXPORT_SYMBOL(st_sensors_spi_configure); diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c index 5cc63d41d855..4b87e79aa744 100644 --- a/drivers/iio/gyro/st_gyro_core.c +++ b/drivers/iio/gyro/st_gyro_core.c @@ -400,9 +400,7 @@ int st_gyro_common_probe(struct iio_dev *indio_dev) if (err) return err; - err = st_sensors_check_device_support(indio_dev, - ARRAY_SIZE(st_gyro_sensors_settings), - st_gyro_sensors_settings); + err = st_sensors_verify_id(indio_dev); if (err < 0) goto st_gyro_power_off; diff --git a/drivers/iio/gyro/st_gyro_spi.c b/drivers/iio/gyro/st_gyro_spi.c index bb7082055f85..b5c624251231 100644 --- a/drivers/iio/gyro/st_gyro_spi.c +++ b/drivers/iio/gyro/st_gyro_spi.c @@ -91,7 +91,9 @@ static int st_gyro_spi_probe(struct spi_device *spi) gdata = iio_priv(indio_dev); gdata->sensor_settings = (struct st_sensor_settings *)settings; - st_sensors_spi_configure(indio_dev, spi, gdata); + err = st_sensors_spi_configure(indio_dev, spi); + if (err < 0) + return err; err = st_gyro_common_probe(indio_dev); if (err < 0) diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c index 43a49a52c81a..3f313aefece6 100644 --- a/drivers/iio/magnetometer/st_magn_core.c +++ b/drivers/iio/magnetometer/st_magn_core.c @@ -502,9 +502,7 @@ int st_magn_common_probe(struct iio_dev *indio_dev) if (err) return err; - err = st_sensors_check_device_support(indio_dev, - ARRAY_SIZE(st_magn_sensors_settings), - st_magn_sensors_settings); + err = st_sensors_verify_id(indio_dev); if (err < 0) goto st_magn_power_off; diff --git a/drivers/iio/magnetometer/st_magn_spi.c b/drivers/iio/magnetometer/st_magn_spi.c index a3045afc6b53..fbf909bde841 100644 --- a/drivers/iio/magnetometer/st_magn_spi.c +++ b/drivers/iio/magnetometer/st_magn_spi.c @@ -73,7 +73,9 @@ static int st_magn_spi_probe(struct spi_device *spi) mdata = iio_priv(indio_dev); mdata->sensor_settings = (struct st_sensor_settings *)settings; - st_sensors_spi_configure(indio_dev, spi, mdata); + err = st_sensors_spi_configure(indio_dev, spi); + if (err < 0) + return err; err = st_magn_common_probe(indio_dev); if (err < 0) diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index 35d80ff27464..a783fc075c26 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -698,9 +698,7 @@ int st_press_common_probe(struct iio_dev *indio_dev) if (err) return err; - err = st_sensors_check_device_support(indio_dev, - ARRAY_SIZE(st_press_sensors_settings), - st_press_sensors_settings); + err = st_sensors_verify_id(indio_dev); if (err < 0) goto st_press_power_off; diff --git a/drivers/iio/pressure/st_pressure_spi.c b/drivers/iio/pressure/st_pressure_spi.c index 3e8c1ffe001e..7c8b70221e70 100644 --- a/drivers/iio/pressure/st_pressure_spi.c +++ b/drivers/iio/pressure/st_pressure_spi.c @@ -83,7 +83,9 @@ static int st_press_spi_probe(struct spi_device *spi) press_data = iio_priv(indio_dev); press_data->sensor_settings = (struct st_sensor_settings *)settings; - st_sensors_spi_configure(indio_dev, spi, press_data); + err = st_sensors_spi_configure(indio_dev, spi); + if (err < 0) + return err; err = st_press_common_probe(indio_dev); if (err < 0) diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 17fbf3e9b013..566b955e2980 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -338,8 +338,7 @@ int st_sensors_get_settings_index(const char *name, const struct st_sensor_settings *list, const int list_length); -int st_sensors_check_device_support(struct iio_dev *indio_dev, - int num_sensors_list, const struct st_sensor_settings *sensor_settings); +int st_sensors_verify_id(struct iio_dev *indio_dev); ssize_t st_sensors_sysfs_sampling_frequency_avail(struct device *dev, struct device_attribute *attr, char *buf); diff --git a/include/linux/iio/common/st_sensors_spi.h b/include/linux/iio/common/st_sensors_spi.h index 6020f7167859..90b25f087f06 100644 --- a/include/linux/iio/common/st_sensors_spi.h +++ b/include/linux/iio/common/st_sensors_spi.h @@ -13,7 +13,7 @@ #include #include -void st_sensors_spi_configure(struct iio_dev *indio_dev, - struct spi_device *spi, struct st_sensor_data *sdata); +int st_sensors_spi_configure(struct iio_dev *indio_dev, + struct spi_device *spi); #endif /* ST_SENSORS_SPI_H */ -- cgit v1.2.3 From 062809ef7733209312562e87cefc84a470430929 Mon Sep 17 00:00:00 2001 From: Denis Ciocca Date: Thu, 18 Jul 2019 15:53:53 -0700 Subject: iio: make st_sensors drivers use regmap This patch is meant to replace the i2c/spi transfer functions with regmap. SPI framework requires DMA safe buffers so let's add GFP_DMA flag for memory allocation used by bulk_read functions. Signed-off-by: Denis Ciocca Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_buffer.c | 3 +- drivers/iio/accel/st_accel_core.c | 3 - drivers/iio/accel/st_accel_i2c.c | 4 +- drivers/iio/common/st_sensors/st_sensors_buffer.c | 10 +-- drivers/iio/common/st_sensors/st_sensors_core.c | 39 +++----- drivers/iio/common/st_sensors/st_sensors_i2c.c | 73 ++++++++------- drivers/iio/common/st_sensors/st_sensors_spi.c | 100 +++++---------------- drivers/iio/common/st_sensors/st_sensors_trigger.c | 10 +-- drivers/iio/gyro/st_gyro_buffer.c | 3 +- drivers/iio/gyro/st_gyro_core.c | 3 - drivers/iio/gyro/st_gyro_i2c.c | 4 +- drivers/iio/magnetometer/st_magn_buffer.c | 3 +- drivers/iio/magnetometer/st_magn_core.c | 3 - drivers/iio/magnetometer/st_magn_i2c.c | 4 +- drivers/iio/pressure/st_pressure_buffer.c | 3 +- drivers/iio/pressure/st_pressure_core.c | 3 - drivers/iio/pressure/st_pressure_i2c.c | 4 +- include/linux/iio/common/st_sensors.h | 40 +-------- include/linux/iio/common/st_sensors_i2c.h | 4 +- 19 files changed, 104 insertions(+), 212 deletions(-) (limited to 'include') diff --git a/drivers/iio/accel/st_accel_buffer.c b/drivers/iio/accel/st_accel_buffer.c index 0205c0167cdd..05f9aea431e2 100644 --- a/drivers/iio/accel/st_accel_buffer.c +++ b/drivers/iio/accel/st_accel_buffer.c @@ -39,7 +39,8 @@ static int st_accel_buffer_postenable(struct iio_dev *indio_dev) int err; struct st_sensor_data *adata = iio_priv(indio_dev); - adata->buffer_data = kmalloc(indio_dev->scan_bytes, GFP_KERNEL); + adata->buffer_data = kmalloc(indio_dev->scan_bytes, + GFP_DMA | GFP_KERNEL); if (adata->buffer_data == NULL) { err = -ENOMEM; goto allocate_memory_error; diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index 630909702a19..0b17004cb12e 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -1177,7 +1176,6 @@ int st_accel_common_probe(struct iio_dev *indio_dev) indio_dev->modes = INDIO_DIRECT_MODE; indio_dev->info = &accel_info; - mutex_init(&adata->tb.buf_lock); err = st_sensors_power_enable(indio_dev); if (err) @@ -1188,7 +1186,6 @@ int st_accel_common_probe(struct iio_dev *indio_dev) goto st_accel_power_off; adata->num_data_channels = ST_ACCEL_NUMBER_DATA_CHANNELS; - adata->multiread_bit = adata->sensor_settings->multi_read_bit; indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS; channels_size = indio_dev->num_channels * sizeof(struct iio_chan_spec); diff --git a/drivers/iio/accel/st_accel_i2c.c b/drivers/iio/accel/st_accel_i2c.c index a92cf776031e..50fa0fc32baa 100644 --- a/drivers/iio/accel/st_accel_i2c.c +++ b/drivers/iio/accel/st_accel_i2c.c @@ -174,7 +174,9 @@ static int st_accel_i2c_probe(struct i2c_client *client) adata = iio_priv(indio_dev); adata->sensor_settings = (struct st_sensor_settings *)settings; - st_sensors_i2c_configure(indio_dev, client, adata); + ret = st_sensors_i2c_configure(indio_dev, client); + if (ret < 0) + return ret; ret = st_accel_common_probe(indio_dev); if (ret < 0) diff --git a/drivers/iio/common/st_sensors/st_sensors_buffer.c b/drivers/iio/common/st_sensors/st_sensors_buffer.c index 4a68669dc555..eee30130ae23 100644 --- a/drivers/iio/common/st_sensors/st_sensors_buffer.c +++ b/drivers/iio/common/st_sensors/st_sensors_buffer.c @@ -17,15 +17,16 @@ #include #include #include +#include #include static int st_sensors_get_buffer_element(struct iio_dev *indio_dev, u8 *buf) { - int i; struct st_sensor_data *sdata = iio_priv(indio_dev); unsigned int num_data_channels = sdata->num_data_channels; + int i; for_each_set_bit(i, indio_dev->active_scan_mask, num_data_channels) { const struct iio_chan_spec *channel = &indio_dev->channels[i]; @@ -36,11 +37,8 @@ static int st_sensors_get_buffer_element(struct iio_dev *indio_dev, u8 *buf) channel->scan_type.storagebits >> 3; buf = PTR_ALIGN(buf, storage_bytes); - if (sdata->tf->read_multiple_byte(&sdata->tb, sdata->dev, - channel->address, - bytes_to_read, buf, - sdata->multiread_bit) < - bytes_to_read) + if (regmap_bulk_read(sdata->regmap, channel->address, + buf, bytes_to_read) < 0) return -EIO; /* Advance the buffer pointer */ diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index f05bf7cf0594..4a3064fb6cd9 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -28,19 +29,10 @@ static inline u32 st_sensors_get_unaligned_le24(const u8 *p) int st_sensors_write_data_with_mask(struct iio_dev *indio_dev, u8 reg_addr, u8 mask, u8 data) { - int err; - u8 new_data; struct st_sensor_data *sdata = iio_priv(indio_dev); - err = sdata->tf->read_byte(&sdata->tb, sdata->dev, reg_addr, &new_data); - if (err < 0) - goto st_sensors_write_data_with_mask_error; - - new_data = ((new_data & (~mask)) | ((data << __ffs(mask)) & mask)); - err = sdata->tf->write_byte(&sdata->tb, sdata->dev, reg_addr, new_data); - -st_sensors_write_data_with_mask_error: - return err; + return regmap_update_bits(sdata->regmap, + reg_addr, mask, data << __ffs(mask)); } int st_sensors_debugfs_reg_access(struct iio_dev *indio_dev, @@ -48,19 +40,15 @@ int st_sensors_debugfs_reg_access(struct iio_dev *indio_dev, unsigned *readval) { struct st_sensor_data *sdata = iio_priv(indio_dev); - u8 readdata; int err; if (!readval) - return sdata->tf->write_byte(&sdata->tb, sdata->dev, - (u8)reg, (u8)writeval); + return regmap_write(sdata->regmap, reg, writeval); - err = sdata->tf->read_byte(&sdata->tb, sdata->dev, (u8)reg, &readdata); + err = regmap_read(sdata->regmap, reg, readval); if (err < 0) return err; - *readval = (unsigned)readdata; - return 0; } EXPORT_SYMBOL(st_sensors_debugfs_reg_access); @@ -545,7 +533,7 @@ st_sensors_match_scale_error: EXPORT_SYMBOL(st_sensors_set_fullscale_by_gain); static int st_sensors_read_axis_data(struct iio_dev *indio_dev, - struct iio_chan_spec const *ch, int *data) + struct iio_chan_spec const *ch, int *data) { int err; u8 *outdata; @@ -554,13 +542,12 @@ static int st_sensors_read_axis_data(struct iio_dev *indio_dev, byte_for_channel = DIV_ROUND_UP(ch->scan_type.realbits + ch->scan_type.shift, 8); - outdata = kmalloc(byte_for_channel, GFP_KERNEL); + outdata = kmalloc(byte_for_channel, GFP_DMA | GFP_KERNEL); if (!outdata) return -ENOMEM; - err = sdata->tf->read_multiple_byte(&sdata->tb, sdata->dev, - ch->address, byte_for_channel, - outdata, sdata->multiread_bit); + err = regmap_bulk_read(sdata->regmap, ch->address, + outdata, byte_for_channel); if (err < 0) goto st_sensors_free_memory; @@ -645,13 +632,11 @@ EXPORT_SYMBOL(st_sensors_get_settings_index); int st_sensors_verify_id(struct iio_dev *indio_dev) { struct st_sensor_data *sdata = iio_priv(indio_dev); - int err; - u8 wai; + int wai, err; if (sdata->sensor_settings->wai_addr) { - err = sdata->tf->read_byte(&sdata->tb, sdata->dev, - sdata->sensor_settings->wai_addr, - &wai); + err = regmap_read(sdata->regmap, + sdata->sensor_settings->wai_addr, &wai); if (err < 0) { dev_err(&indio_dev->dev, "failed to read Who-Am-I register.\n"); diff --git a/drivers/iio/common/st_sensors/st_sensors_i2c.c b/drivers/iio/common/st_sensors/st_sensors_i2c.c index b1c9812407e7..9240625534df 100644 --- a/drivers/iio/common/st_sensors/st_sensors_i2c.c +++ b/drivers/iio/common/st_sensors/st_sensors_i2c.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -26,55 +27,51 @@ static unsigned int st_sensors_i2c_get_irq(struct iio_dev *indio_dev) return to_i2c_client(sdata->dev)->irq; } -static int st_sensors_i2c_read_byte(struct st_sensor_transfer_buffer *tb, - struct device *dev, u8 reg_addr, u8 *res_byte) -{ - int err; - - err = i2c_smbus_read_byte_data(to_i2c_client(dev), reg_addr); - if (err < 0) - goto st_accel_i2c_read_byte_error; - - *res_byte = err & 0xff; - -st_accel_i2c_read_byte_error: - return err < 0 ? err : 0; -} - -static int st_sensors_i2c_read_multiple_byte( - struct st_sensor_transfer_buffer *tb, struct device *dev, - u8 reg_addr, int len, u8 *data, bool multiread_bit) -{ - if (multiread_bit) - reg_addr |= ST_SENSORS_I2C_MULTIREAD; - - return i2c_smbus_read_i2c_block_data_or_emulated(to_i2c_client(dev), - reg_addr, len, data); -} - -static int st_sensors_i2c_write_byte(struct st_sensor_transfer_buffer *tb, - struct device *dev, u8 reg_addr, u8 data) -{ - return i2c_smbus_write_byte_data(to_i2c_client(dev), reg_addr, data); -} +static const struct regmap_config st_sensors_i2c_regmap_config = { + .reg_bits = 8, + .val_bits = 8, +}; -static const struct st_sensor_transfer_function st_sensors_tf_i2c = { - .read_byte = st_sensors_i2c_read_byte, - .write_byte = st_sensors_i2c_write_byte, - .read_multiple_byte = st_sensors_i2c_read_multiple_byte, +static const struct regmap_config st_sensors_i2c_regmap_multiread_bit_config = { + .reg_bits = 8, + .val_bits = 8, + .read_flag_mask = ST_SENSORS_I2C_MULTIREAD, }; -void st_sensors_i2c_configure(struct iio_dev *indio_dev, - struct i2c_client *client, struct st_sensor_data *sdata) +/* + * st_sensors_i2c_configure() - configure I2C interface + * @indio_dev: IIO device reference. + * @client: i2c client reference. + * + * Return: 0 on success, else a negative error code. + */ +int st_sensors_i2c_configure(struct iio_dev *indio_dev, + struct i2c_client *client) { + struct st_sensor_data *sdata = iio_priv(indio_dev); + const struct regmap_config *config; + + if (sdata->sensor_settings->multi_read_bit) + config = &st_sensors_i2c_regmap_multiread_bit_config; + else + config = &st_sensors_i2c_regmap_config; + + sdata->regmap = devm_regmap_init_i2c(client, config); + if (IS_ERR(sdata->regmap)) { + dev_err(&client->dev, "Failed to register i2c regmap (%d)\n", + (int)PTR_ERR(sdata->regmap)); + return PTR_ERR(sdata->regmap); + } + i2c_set_clientdata(client, indio_dev); indio_dev->dev.parent = &client->dev; indio_dev->name = client->name; sdata->dev = &client->dev; - sdata->tf = &st_sensors_tf_i2c; sdata->get_irq_data_ready = st_sensors_i2c_get_irq; + + return 0; } EXPORT_SYMBOL(st_sensors_i2c_configure); diff --git a/drivers/iio/common/st_sensors/st_sensors_spi.c b/drivers/iio/common/st_sensors/st_sensors_spi.c index a57cd648975c..9c0661a283d0 100644 --- a/drivers/iio/common/st_sensors/st_sensors_spi.c +++ b/drivers/iio/common/st_sensors/st_sensors_spi.c @@ -11,12 +11,12 @@ #include #include #include +#include #include - +#include "st_sensors_core.h" #define ST_SENSORS_SPI_MULTIREAD 0xc0 -#define ST_SENSORS_SPI_READ 0x80 static unsigned int st_sensors_spi_get_irq(struct iio_dev *indio_dev) { @@ -25,81 +25,15 @@ static unsigned int st_sensors_spi_get_irq(struct iio_dev *indio_dev) return to_spi_device(sdata->dev)->irq; } -static int st_sensors_spi_read(struct st_sensor_transfer_buffer *tb, - struct device *dev, u8 reg_addr, int len, u8 *data, bool multiread_bit) -{ - int err; - - struct spi_transfer xfers[] = { - { - .tx_buf = tb->tx_buf, - .bits_per_word = 8, - .len = 1, - }, - { - .rx_buf = tb->rx_buf, - .bits_per_word = 8, - .len = len, - } - }; - - mutex_lock(&tb->buf_lock); - if ((multiread_bit) && (len > 1)) - tb->tx_buf[0] = reg_addr | ST_SENSORS_SPI_MULTIREAD; - else - tb->tx_buf[0] = reg_addr | ST_SENSORS_SPI_READ; - - err = spi_sync_transfer(to_spi_device(dev), xfers, ARRAY_SIZE(xfers)); - if (err) - goto acc_spi_read_error; - - memcpy(data, tb->rx_buf, len); - mutex_unlock(&tb->buf_lock); - return len; - -acc_spi_read_error: - mutex_unlock(&tb->buf_lock); - return err; -} - -static int st_sensors_spi_read_byte(struct st_sensor_transfer_buffer *tb, - struct device *dev, u8 reg_addr, u8 *res_byte) -{ - return st_sensors_spi_read(tb, dev, reg_addr, 1, res_byte, false); -} - -static int st_sensors_spi_read_multiple_byte( - struct st_sensor_transfer_buffer *tb, struct device *dev, - u8 reg_addr, int len, u8 *data, bool multiread_bit) -{ - return st_sensors_spi_read(tb, dev, reg_addr, len, data, multiread_bit); -} - -static int st_sensors_spi_write_byte(struct st_sensor_transfer_buffer *tb, - struct device *dev, u8 reg_addr, u8 data) -{ - int err; - - struct spi_transfer xfers = { - .tx_buf = tb->tx_buf, - .bits_per_word = 8, - .len = 2, - }; - - mutex_lock(&tb->buf_lock); - tb->tx_buf[0] = reg_addr; - tb->tx_buf[1] = data; - - err = spi_sync_transfer(to_spi_device(dev), &xfers, 1); - mutex_unlock(&tb->buf_lock); - - return err; -} +static const struct regmap_config st_sensors_spi_regmap_config = { + .reg_bits = 8, + .val_bits = 8, +}; -static const struct st_sensor_transfer_function st_sensors_tf_spi = { - .read_byte = st_sensors_spi_read_byte, - .write_byte = st_sensors_spi_write_byte, - .read_multiple_byte = st_sensors_spi_read_multiple_byte, +static const struct regmap_config st_sensors_spi_regmap_multiread_bit_config = { + .reg_bits = 8, + .val_bits = 8, + .read_flag_mask = ST_SENSORS_SPI_MULTIREAD, }; /* @@ -155,6 +89,7 @@ int st_sensors_spi_configure(struct iio_dev *indio_dev, struct spi_device *spi) { struct st_sensor_data *sdata = iio_priv(indio_dev); + const struct regmap_config *config; int err; if (st_sensors_is_spi_3_wire(spi)) { @@ -164,13 +99,24 @@ int st_sensors_spi_configure(struct iio_dev *indio_dev, return err; } + if (sdata->sensor_settings->multi_read_bit) + config = &st_sensors_spi_regmap_multiread_bit_config; + else + config = &st_sensors_spi_regmap_config; + + sdata->regmap = devm_regmap_init_spi(spi, config); + if (IS_ERR(sdata->regmap)) { + dev_err(&spi->dev, "Failed to register spi regmap (%d)\n", + (int)PTR_ERR(sdata->regmap)); + return PTR_ERR(sdata->regmap); + } + spi_set_drvdata(spi, indio_dev); indio_dev->dev.parent = &spi->dev; indio_dev->name = spi->modalias; sdata->dev = &spi->dev; - sdata->tf = &st_sensors_tf_spi; sdata->get_irq_data_ready = st_sensors_spi_get_irq; return 0; diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c index 630c8cb35e8b..bed7b8682b17 100644 --- a/drivers/iio/common/st_sensors/st_sensors_trigger.c +++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "st_sensors_core.h" @@ -26,8 +27,7 @@ static int st_sensors_new_samples_available(struct iio_dev *indio_dev, struct st_sensor_data *sdata) { - u8 status; - int ret; + int ret, status; /* How would I know if I can't check it? */ if (!sdata->sensor_settings->drdy_irq.stat_drdy.addr) @@ -37,9 +37,9 @@ static int st_sensors_new_samples_available(struct iio_dev *indio_dev, if (!indio_dev->active_scan_mask) return 0; - ret = sdata->tf->read_byte(&sdata->tb, sdata->dev, - sdata->sensor_settings->drdy_irq.stat_drdy.addr, - &status); + ret = regmap_read(sdata->regmap, + sdata->sensor_settings->drdy_irq.stat_drdy.addr, + &status); if (ret < 0) { dev_err(sdata->dev, "error checking samples available\n"); diff --git a/drivers/iio/gyro/st_gyro_buffer.c b/drivers/iio/gyro/st_gyro_buffer.c index 6e362f735e92..21360681d4dd 100644 --- a/drivers/iio/gyro/st_gyro_buffer.c +++ b/drivers/iio/gyro/st_gyro_buffer.c @@ -39,7 +39,8 @@ static int st_gyro_buffer_postenable(struct iio_dev *indio_dev) int err; struct st_sensor_data *gdata = iio_priv(indio_dev); - gdata->buffer_data = kmalloc(indio_dev->scan_bytes, GFP_KERNEL); + gdata->buffer_data = kmalloc(indio_dev->scan_bytes, + GFP_DMA | GFP_KERNEL); if (gdata->buffer_data == NULL) { err = -ENOMEM; goto allocate_memory_error; diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c index 4b87e79aa744..02e42c945181 100644 --- a/drivers/iio/gyro/st_gyro_core.c +++ b/drivers/iio/gyro/st_gyro_core.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -394,7 +393,6 @@ int st_gyro_common_probe(struct iio_dev *indio_dev) indio_dev->modes = INDIO_DIRECT_MODE; indio_dev->info = &gyro_info; - mutex_init(&gdata->tb.buf_lock); err = st_sensors_power_enable(indio_dev); if (err) @@ -405,7 +403,6 @@ int st_gyro_common_probe(struct iio_dev *indio_dev) goto st_gyro_power_off; gdata->num_data_channels = ST_GYRO_NUMBER_DATA_CHANNELS; - gdata->multiread_bit = gdata->sensor_settings->multi_read_bit; indio_dev->channels = gdata->sensor_settings->ch; indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS; diff --git a/drivers/iio/gyro/st_gyro_i2c.c b/drivers/iio/gyro/st_gyro_i2c.c index fa71e94b76f4..05a1a0874bd5 100644 --- a/drivers/iio/gyro/st_gyro_i2c.c +++ b/drivers/iio/gyro/st_gyro_i2c.c @@ -87,7 +87,9 @@ static int st_gyro_i2c_probe(struct i2c_client *client, gdata = iio_priv(indio_dev); gdata->sensor_settings = (struct st_sensor_settings *)settings; - st_sensors_i2c_configure(indio_dev, client, gdata); + err = st_sensors_i2c_configure(indio_dev, client); + if (err < 0) + return err; err = st_gyro_common_probe(indio_dev); if (err < 0) diff --git a/drivers/iio/magnetometer/st_magn_buffer.c b/drivers/iio/magnetometer/st_magn_buffer.c index 11d7806655bc..9dba93539a99 100644 --- a/drivers/iio/magnetometer/st_magn_buffer.c +++ b/drivers/iio/magnetometer/st_magn_buffer.c @@ -34,7 +34,8 @@ static int st_magn_buffer_postenable(struct iio_dev *indio_dev) int err; struct st_sensor_data *mdata = iio_priv(indio_dev); - mdata->buffer_data = kmalloc(indio_dev->scan_bytes, GFP_KERNEL); + mdata->buffer_data = kmalloc(indio_dev->scan_bytes, + GFP_DMA | GFP_KERNEL); if (mdata->buffer_data == NULL) { err = -ENOMEM; goto allocate_memory_error; diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c index 3f313aefece6..804353a483c7 100644 --- a/drivers/iio/magnetometer/st_magn_core.c +++ b/drivers/iio/magnetometer/st_magn_core.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -496,7 +495,6 @@ int st_magn_common_probe(struct iio_dev *indio_dev) indio_dev->modes = INDIO_DIRECT_MODE; indio_dev->info = &magn_info; - mutex_init(&mdata->tb.buf_lock); err = st_sensors_power_enable(indio_dev); if (err) @@ -507,7 +505,6 @@ int st_magn_common_probe(struct iio_dev *indio_dev) goto st_magn_power_off; mdata->num_data_channels = ST_MAGN_NUMBER_DATA_CHANNELS; - mdata->multiread_bit = mdata->sensor_settings->multi_read_bit; indio_dev->channels = mdata->sensor_settings->ch; indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS; diff --git a/drivers/iio/magnetometer/st_magn_i2c.c b/drivers/iio/magnetometer/st_magn_i2c.c index d5d565639bed..fdba480a12be 100644 --- a/drivers/iio/magnetometer/st_magn_i2c.c +++ b/drivers/iio/magnetometer/st_magn_i2c.c @@ -79,7 +79,9 @@ static int st_magn_i2c_probe(struct i2c_client *client, mdata = iio_priv(indio_dev); mdata->sensor_settings = (struct st_sensor_settings *)settings; - st_sensors_i2c_configure(indio_dev, client, mdata); + err = st_sensors_i2c_configure(indio_dev, client); + if (err < 0) + return err; err = st_magn_common_probe(indio_dev); if (err < 0) diff --git a/drivers/iio/pressure/st_pressure_buffer.c b/drivers/iio/pressure/st_pressure_buffer.c index 4566e08a64a1..f21b630abaa0 100644 --- a/drivers/iio/pressure/st_pressure_buffer.c +++ b/drivers/iio/pressure/st_pressure_buffer.c @@ -39,7 +39,8 @@ static int st_press_buffer_postenable(struct iio_dev *indio_dev) int err; struct st_sensor_data *press_data = iio_priv(indio_dev); - press_data->buffer_data = kmalloc(indio_dev->scan_bytes, GFP_KERNEL); + press_data->buffer_data = kmalloc(indio_dev->scan_bytes, + GFP_DMA | GFP_KERNEL); if (press_data->buffer_data == NULL) { err = -ENOMEM; goto allocate_memory_error; diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index a783fc075c26..9ef92a501286 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -692,7 +691,6 @@ int st_press_common_probe(struct iio_dev *indio_dev) indio_dev->modes = INDIO_DIRECT_MODE; indio_dev->info = &press_info; - mutex_init(&press_data->tb.buf_lock); err = st_sensors_power_enable(indio_dev); if (err) @@ -709,7 +707,6 @@ int st_press_common_probe(struct iio_dev *indio_dev) * element. */ press_data->num_data_channels = press_data->sensor_settings->num_ch - 1; - press_data->multiread_bit = press_data->sensor_settings->multi_read_bit; indio_dev->channels = press_data->sensor_settings->ch; indio_dev->num_channels = press_data->sensor_settings->num_ch; diff --git a/drivers/iio/pressure/st_pressure_i2c.c b/drivers/iio/pressure/st_pressure_i2c.c index 466e7dde5eae..71d2ed6b4948 100644 --- a/drivers/iio/pressure/st_pressure_i2c.c +++ b/drivers/iio/pressure/st_pressure_i2c.c @@ -112,7 +112,9 @@ static int st_press_i2c_probe(struct i2c_client *client, press_data = iio_priv(indio_dev); press_data->sensor_settings = (struct st_sensor_settings *)settings; - st_sensors_i2c_configure(indio_dev, client, press_data); + ret = st_sensors_i2c_configure(indio_dev, client); + if (ret < 0) + return ret; ret = st_press_common_probe(indio_dev); if (ret < 0) diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 566b955e2980..28fc1f9fa7d5 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -169,36 +170,6 @@ struct st_sensor_data_ready_irq { } ig1; }; -/** - * struct st_sensor_transfer_buffer - ST sensor device I/O buffer - * @buf_lock: Mutex to protect rx and tx buffers. - * @tx_buf: Buffer used by SPI transfer function to send data to the sensors. - * This buffer is used to avoid DMA not-aligned issue. - * @rx_buf: Buffer used by SPI transfer to receive data from sensors. - * This buffer is used to avoid DMA not-aligned issue. - */ -struct st_sensor_transfer_buffer { - struct mutex buf_lock; - u8 rx_buf[ST_SENSORS_RX_MAX_LENGTH]; - u8 tx_buf[ST_SENSORS_TX_MAX_LENGTH] ____cacheline_aligned; -}; - -/** - * struct st_sensor_transfer_function - ST sensor device I/O function - * @read_byte: Function used to read one byte. - * @write_byte: Function used to write one byte. - * @read_multiple_byte: Function used to read multiple byte. - */ -struct st_sensor_transfer_function { - int (*read_byte) (struct st_sensor_transfer_buffer *tb, - struct device *dev, u8 reg_addr, u8 *res_byte); - int (*write_byte) (struct st_sensor_transfer_buffer *tb, - struct device *dev, u8 reg_addr, u8 data); - int (*read_multiple_byte) (struct st_sensor_transfer_buffer *tb, - struct device *dev, u8 reg_addr, int len, u8 *data, - bool multiread_bit); -}; - /** * struct st_sensor_settings - ST specific sensor settings * @wai: Contents of WhoAmI register. @@ -242,16 +213,14 @@ struct st_sensor_settings { * @current_fullscale: Maximum range of measure by the sensor. * @vdd: Pointer to sensor's Vdd power supply * @vdd_io: Pointer to sensor's Vdd-IO power supply + * @regmap: Pointer to specific sensor regmap configuration. * @enabled: Status of the sensor (false->off, true->on). - * @multiread_bit: Use or not particular bit for [I2C/SPI] multiread. * @buffer_data: Data used by buffer part. * @odr: Output data rate of the sensor [Hz]. * num_data_channels: Number of data channels used in buffer. * @drdy_int_pin: Redirect DRDY on pin 1 (1) or pin 2 (2). * @int_pin_open_drain: Set the interrupt/DRDY to open drain. * @get_irq_data_ready: Function to get the IRQ used for data ready signal. - * @tf: Transfer function structure used by I/O operations. - * @tb: Transfer buffers and mutex used by I/O operations. * @edge_irq: the IRQ triggers on edges and need special handling. * @hw_irq_trigger: if we're using the hardware interrupt on the sensor. * @hw_timestamp: Latest timestamp from the interrupt handler, when in use. @@ -264,9 +233,9 @@ struct st_sensor_data { struct st_sensor_fullscale_avl *current_fullscale; struct regulator *vdd; struct regulator *vdd_io; + struct regmap *regmap; bool enabled; - bool multiread_bit; char *buffer_data; @@ -278,9 +247,6 @@ struct st_sensor_data { unsigned int (*get_irq_data_ready) (struct iio_dev *indio_dev); - const struct st_sensor_transfer_function *tf; - struct st_sensor_transfer_buffer tb; - bool edge_irq; bool hw_irq_trigger; s64 hw_timestamp; diff --git a/include/linux/iio/common/st_sensors_i2c.h b/include/linux/iio/common/st_sensors_i2c.h index 5ada89944698..01e424e2af4f 100644 --- a/include/linux/iio/common/st_sensors_i2c.h +++ b/include/linux/iio/common/st_sensors_i2c.h @@ -14,8 +14,8 @@ #include #include -void st_sensors_i2c_configure(struct iio_dev *indio_dev, - struct i2c_client *client, struct st_sensor_data *sdata); +int st_sensors_i2c_configure(struct iio_dev *indio_dev, + struct i2c_client *client); #ifdef CONFIG_ACPI int st_sensors_match_acpi_device(struct device *dev); -- cgit v1.2.3 From dca39af8831e65adc13e0f9f8bdca3a746185072 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 23 Jul 2019 10:36:38 +0300 Subject: iio: imu: adis: Add support for SPI transfer cs_change_delay The ADIS16460 requires a higher delay before the next transfer. Since the SPI framework supports configuring the delay before the next transfer, this driver will become the first user of it. The support for this functionality in ADIS16460 requires an addition to the ADIS lib to support the `cs_change_delay` functionality from the SPI subsystem. Signed-off-by: Michael Hennerich Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis.c | 12 ++++++++++++ include/linux/iio/imu/adis.h | 2 ++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/drivers/iio/imu/adis.c b/drivers/iio/imu/adis.c index 30281e91dbf9..1631c255deab 100644 --- a/drivers/iio/imu/adis.c +++ b/drivers/iio/imu/adis.c @@ -39,18 +39,24 @@ int adis_write_reg(struct adis *adis, unsigned int reg, .len = 2, .cs_change = 1, .delay_usecs = adis->data->write_delay, + .cs_change_delay = adis->data->cs_change_delay, + .cs_change_delay_unit = SPI_DELAY_UNIT_USECS, }, { .tx_buf = adis->tx + 2, .bits_per_word = 8, .len = 2, .cs_change = 1, .delay_usecs = adis->data->write_delay, + .cs_change_delay = adis->data->cs_change_delay, + .cs_change_delay_unit = SPI_DELAY_UNIT_USECS, }, { .tx_buf = adis->tx + 4, .bits_per_word = 8, .len = 2, .cs_change = 1, .delay_usecs = adis->data->write_delay, + .cs_change_delay = adis->data->cs_change_delay, + .cs_change_delay_unit = SPI_DELAY_UNIT_USECS, }, { .tx_buf = adis->tx + 6, .bits_per_word = 8, @@ -133,12 +139,16 @@ int adis_read_reg(struct adis *adis, unsigned int reg, .len = 2, .cs_change = 1, .delay_usecs = adis->data->write_delay, + .cs_change_delay = adis->data->cs_change_delay, + .cs_change_delay_unit = SPI_DELAY_UNIT_USECS, }, { .tx_buf = adis->tx + 2, .bits_per_word = 8, .len = 2, .cs_change = 1, .delay_usecs = adis->data->read_delay, + .cs_change_delay = adis->data->cs_change_delay, + .cs_change_delay_unit = SPI_DELAY_UNIT_USECS, }, { .tx_buf = adis->tx + 4, .rx_buf = adis->rx, @@ -146,6 +156,8 @@ int adis_read_reg(struct adis *adis, unsigned int reg, .len = 2, .cs_change = 1, .delay_usecs = adis->data->read_delay, + .cs_change_delay = adis->data->cs_change_delay, + .cs_change_delay_unit = SPI_DELAY_UNIT_USECS, }, { .rx_buf = adis->rx + 2, .bits_per_word = 8, diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 3428d06b2f44..4c53815bb729 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -26,6 +26,7 @@ struct adis_burst; * struct adis_data - ADIS chip variant specific data * @read_delay: SPI delay for read operations in us * @write_delay: SPI delay for write operations in us + * @cs_change_delay: SPI delay between CS changes in us * @glob_cmd_reg: Register address of the GLOB_CMD register * @msc_ctrl_reg: Register address of the MSC_CTRL register * @diag_stat_reg: Register address of the DIAG_STAT register @@ -35,6 +36,7 @@ struct adis_burst; struct adis_data { unsigned int read_delay; unsigned int write_delay; + unsigned int cs_change_delay; unsigned int glob_cmd_reg; unsigned int msc_ctrl_reg; -- cgit v1.2.3 From 12bf745c9afb6755acba186091bcbb61229f4165 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Mon, 15 Jul 2019 16:14:51 -0700 Subject: iio: cros_ec: Add sign vector in core for backward compatibility To allow cros_ec iio core library to be used with legacy device, add a vector to rotate sensor data if necessary: legacy devices are not reporting data in HTML5/Android sensor referential. Check the data is not rotated on recent chromebooks that use the HTML5 standard to present sensor data. Signed-off-by: Gwendal Grignou Reviewed-by: Douglas Anderson Signed-off-by: Jonathan Cameron --- drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c | 4 ++++ include/linux/iio/common/cros_ec_sensors_core.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c index d02660b1f0fe..3b491cf7be95 100644 --- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c +++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c @@ -101,6 +101,9 @@ int cros_ec_sensors_core_init(struct platform_device *pdev, } state->type = state->resp->info.type; state->loc = state->resp->info.location; + + /* Set sign vector, only used for backward compatibility. */ + memset(state->sign, 1, CROS_EC_SENSOR_MAX_AXIS); } return 0; @@ -303,6 +306,7 @@ static int cros_ec_sensors_read_data_unsafe(struct iio_dev *indio_dev, if (ret < 0) return ret; + *data *= st->sign[i]; data++; } diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h index bb03a252bd04..6f13c73bac3d 100644 --- a/include/linux/iio/common/cros_ec_sensors_core.h +++ b/include/linux/iio/common/cros_ec_sensors_core.h @@ -66,7 +66,7 @@ struct cros_ec_sensors_core_state { s16 offset; u16 scale; } calib[CROS_EC_SENSOR_MAX_AXIS]; - + s8 sign[CROS_EC_SENSOR_MAX_AXIS]; u8 samples[CROS_EC_SAMPLE_SIZE]; int (*read_ec_sensors_data)(struct iio_dev *indio_dev, -- cgit v1.2.3 From ae7b02ad2f32d39d1434655f346c04a16c1aa703 Mon Sep 17 00:00:00 2001 From: Fabien Lahoudere Date: Tue, 16 Jul 2019 11:11:06 +0200 Subject: iio: common: cros_ec_sensors: Expose cros_ec_sensors frequency range via iio sysfs Embedded controller return minimum and maximum frequencies, unfortunately we have no way to know the step for all available frequencies. Even if not complete, we can return a list of known values using the standard read_avail callback (IIO_CHAN_INFO_SAMP_FREQ) to provide them to userland. Now cros_ec_* sensors provides frequencies values in sysfs like this: "0 min max". 0 is always true to disable the sensor. Default frequencies are provided for earlier protocol. Signed-off-by: Nick Vaccaro Signed-off-by: Fabien Lahoudere [rebased on top of iio/testing and solved conflicts] Signed-off-by: Enric Balletbo i Serra Signed-off-by: Jonathan Cameron --- .../iio/common/cros_ec_sensors/cros_ec_sensors.c | 3 + .../common/cros_ec_sensors/cros_ec_sensors_core.c | 65 ++++++++++++++++++++++ drivers/iio/light/cros_ec_light_prox.c | 3 + include/linux/iio/common/cros_ec_sensors_core.h | 21 +++++++ 4 files changed, 92 insertions(+) (limited to 'include') diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c index 2af09606c438..6a4919cd83c3 100644 --- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c +++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c @@ -215,6 +215,7 @@ static int cros_ec_sensors_write(struct iio_dev *indio_dev, static const struct iio_info ec_sensors_info = { .read_raw = &cros_ec_sensors_read, .write_raw = &cros_ec_sensors_write, + .read_avail = &cros_ec_sensors_core_read_avail, }; static int cros_ec_sensors_probe(struct platform_device *pdev) @@ -252,6 +253,8 @@ static int cros_ec_sensors_probe(struct platform_device *pdev) BIT(IIO_CHAN_INFO_SCALE) | BIT(IIO_CHAN_INFO_FREQUENCY) | BIT(IIO_CHAN_INFO_SAMP_FREQ); + channel->info_mask_shared_by_all_available = + BIT(IIO_CHAN_INFO_SAMP_FREQ); channel->scan_type.realbits = CROS_EC_SENSOR_BITS; channel->scan_type.storagebits = CROS_EC_SENSOR_BITS; channel->scan_index = i; diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c index 3b491cf7be95..fd833295bb17 100644 --- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c +++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c @@ -50,6 +50,37 @@ static int cros_ec_get_host_cmd_version_mask(struct cros_ec_device *ec_dev, return ret; } +static void get_default_min_max_freq(enum motionsensor_type type, + u32 *min_freq, + u32 *max_freq) +{ + switch (type) { + case MOTIONSENSE_TYPE_ACCEL: + case MOTIONSENSE_TYPE_GYRO: + *min_freq = 12500; + *max_freq = 100000; + break; + case MOTIONSENSE_TYPE_MAG: + *min_freq = 5000; + *max_freq = 25000; + break; + case MOTIONSENSE_TYPE_PROX: + case MOTIONSENSE_TYPE_LIGHT: + *min_freq = 100; + *max_freq = 50000; + break; + case MOTIONSENSE_TYPE_BARO: + *min_freq = 250; + *max_freq = 20000; + break; + case MOTIONSENSE_TYPE_ACTIVITY: + default: + *min_freq = 0; + *max_freq = 0; + break; + } +} + int cros_ec_sensors_core_init(struct platform_device *pdev, struct iio_dev *indio_dev, bool physical_device) @@ -104,6 +135,19 @@ int cros_ec_sensors_core_init(struct platform_device *pdev, /* Set sign vector, only used for backward compatibility. */ memset(state->sign, 1, CROS_EC_SENSOR_MAX_AXIS); + + /* 0 is a correct value used to stop the device */ + state->frequencies[0] = 0; + if (state->msg->version < 3) { + get_default_min_max_freq(state->resp->info.type, + &state->frequencies[1], + &state->frequencies[2]); + } else { + state->frequencies[1] = + state->resp->info_3.min_frequency; + state->frequencies[2] = + state->resp->info_3.max_frequency; + } } return 0; @@ -471,6 +515,27 @@ int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st, } EXPORT_SYMBOL_GPL(cros_ec_sensors_core_read); +int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + const int **vals, + int *type, + int *length, + long mask) +{ + struct cros_ec_sensors_core_state *state = iio_priv(indio_dev); + + switch (mask) { + case IIO_CHAN_INFO_SAMP_FREQ: + *length = ARRAY_SIZE(state->frequencies); + *vals = (const int *)&state->frequencies; + *type = IIO_VAL_INT; + return IIO_AVAIL_LIST; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(cros_ec_sensors_core_read_avail); + int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st, struct iio_chan_spec const *chan, int val, int val2, long mask) diff --git a/drivers/iio/light/cros_ec_light_prox.c b/drivers/iio/light/cros_ec_light_prox.c index 965f346d7d64..44313a009928 100644 --- a/drivers/iio/light/cros_ec_light_prox.c +++ b/drivers/iio/light/cros_ec_light_prox.c @@ -162,6 +162,7 @@ static int cros_ec_light_prox_write(struct iio_dev *indio_dev, static const struct iio_info cros_ec_light_prox_info = { .read_raw = &cros_ec_light_prox_read, .write_raw = &cros_ec_light_prox_write, + .read_avail = &cros_ec_sensors_core_read_avail, }; static int cros_ec_light_prox_probe(struct platform_device *pdev) @@ -196,6 +197,8 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev) channel->info_mask_shared_by_all = BIT(IIO_CHAN_INFO_SAMP_FREQ) | BIT(IIO_CHAN_INFO_FREQUENCY); + channel->info_mask_shared_by_all_available = + BIT(IIO_CHAN_INFO_SAMP_FREQ); channel->scan_type.realbits = CROS_EC_SENSOR_BITS; channel->scan_type.storagebits = CROS_EC_SENSOR_BITS; channel->scan_type.shift = 0; diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h index 6f13c73bac3d..ea1f50ce2e49 100644 --- a/include/linux/iio/common/cros_ec_sensors_core.h +++ b/include/linux/iio/common/cros_ec_sensors_core.h @@ -73,6 +73,9 @@ struct cros_ec_sensors_core_state { unsigned long scan_mask, s16 *data); int curr_sampl_freq; + + /* Table of known available frequencies : 0, Min and Max in mHz */ + int frequencies[3]; }; /** @@ -153,6 +156,24 @@ int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st, struct iio_chan_spec const *chan, int *val, int *val2, long mask); +/** + * cros_ec_sensors_core_read_avail() - get available values + * @indio_dev: pointer to state information for device + * @chan: channel specification structure table + * @vals: list of available values + * @type: type of data returned + * @length: number of data returned in the array + * @mask: specifies which values to be requested + * + * Return: an error code, IIO_AVAIL_RANGE or IIO_AVAIL_LIST + */ +int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + const int **vals, + int *type, + int *length, + long mask); + /** * cros_ec_sensors_core_write() - function to write a value to the sensor * @st: pointer to state information for device -- cgit v1.2.3 From ffe0bbabb0cffceceae07484fde1ec2a63b1537c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 8 Jul 2019 10:23:43 +0200 Subject: gpio: don't WARN() on NULL descs if gpiolib is disabled If gpiolib is disabled, we use the inline stubs from gpio/consumer.h instead of regular definitions of GPIO API. The stubs for 'optional' variants of gpiod_get routines return NULL in this case as if the relevant GPIO wasn't found. This is correct so far. Calling other (non-gpio_get) stubs from this header triggers a warning because the GPIO descriptor couldn't have been requested. The warning however is unconditional (WARN_ON(1)) and is emitted even if the passed descriptor pointer is NULL. We don't want to force the users of 'optional' gpio_get to check the returned pointer before calling e.g. gpiod_set_value() so let's only WARN on non-NULL descriptors. Cc: stable@vger.kernel.org Reported-by: Claus H. Stovgaard Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 64 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 9ddcf50a3c59..a7f08fb0f865 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -247,7 +247,7 @@ static inline void gpiod_put(struct gpio_desc *desc) might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline void devm_gpiod_unhinge(struct device *dev, @@ -256,7 +256,7 @@ static inline void devm_gpiod_unhinge(struct device *dev, might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline void gpiod_put_array(struct gpio_descs *descs) @@ -264,7 +264,7 @@ static inline void gpiod_put_array(struct gpio_descs *descs) might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(descs); } static inline struct gpio_desc *__must_check @@ -317,7 +317,7 @@ static inline void devm_gpiod_put(struct device *dev, struct gpio_desc *desc) might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline void devm_gpiod_put_array(struct device *dev, @@ -326,32 +326,32 @@ static inline void devm_gpiod_put_array(struct device *dev, might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(descs); } static inline int gpiod_get_direction(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_direction_input(struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_direction_output(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_direction_output_raw(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } @@ -359,7 +359,7 @@ static inline int gpiod_direction_output_raw(struct gpio_desc *desc, int value) static inline int gpiod_get_value(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_get_array_value(unsigned int array_size, @@ -368,13 +368,13 @@ static inline int gpiod_get_array_value(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline void gpiod_set_value(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline int gpiod_set_array_value(unsigned int array_size, struct gpio_desc **desc_array, @@ -382,13 +382,13 @@ static inline int gpiod_set_array_value(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline int gpiod_get_raw_value(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_get_raw_array_value(unsigned int array_size, @@ -397,13 +397,13 @@ static inline int gpiod_get_raw_array_value(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline void gpiod_set_raw_value(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline int gpiod_set_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, @@ -411,14 +411,14 @@ static inline int gpiod_set_raw_array_value(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline int gpiod_get_value_cansleep(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_get_array_value_cansleep(unsigned int array_size, @@ -427,13 +427,13 @@ static inline int gpiod_get_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline int gpiod_set_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, @@ -441,13 +441,13 @@ static inline int gpiod_set_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_get_raw_array_value_cansleep(unsigned int array_size, @@ -456,14 +456,14 @@ static inline int gpiod_get_raw_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline int gpiod_set_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, @@ -471,41 +471,41 @@ static inline int gpiod_set_raw_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_is_active_low(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_cansleep(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_to_irq(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -EINVAL; } @@ -513,7 +513,7 @@ static inline int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -EINVAL; } @@ -525,7 +525,7 @@ static inline struct gpio_desc *gpio_to_desc(unsigned gpio) static inline int desc_to_gpio(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -EINVAL; } -- cgit v1.2.3 From 085771ec14b9bdb843fe9283d4703ced395d1b0b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 Jul 2019 09:26:20 -0700 Subject: fs-verity: add UAPI header Add the UAPI header for fs-verity, including two ioctls: - FS_IOC_ENABLE_VERITY - FS_IOC_MEASURE_VERITY These ioctls are documented in the "User API" section of Documentation/filesystems/fsverity.rst. Examples of using these ioctls can be found in fsverity-utils (https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/fsverity-utils.git). I've also written xfstests that test these ioctls (https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/xfstests-dev.git/log/?h=fsverity). Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- Documentation/ioctl/ioctl-number.rst | 1 + include/uapi/linux/fsverity.h | 39 ++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 include/uapi/linux/fsverity.h (limited to 'include') diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst index 7f8dcae7a230..bef79cd4c6b4 100644 --- a/Documentation/ioctl/ioctl-number.rst +++ b/Documentation/ioctl/ioctl-number.rst @@ -233,6 +233,7 @@ Code Seq# Include File Comments 'f' 00-0F fs/ext4/ext4.h conflict! 'f' 00-0F linux/fs.h conflict! 'f' 00-0F fs/ocfs2/ocfs2_fs.h conflict! +'f' 81-8F linux/fsverity.h 'g' 00-0F linux/usb/gadgetfs.h 'g' 20-2F linux/usb/g_printer.h 'h' 00-7F conflict! Charon filesystem diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h new file mode 100644 index 000000000000..57d1d7fc0c34 --- /dev/null +++ b/include/uapi/linux/fsverity.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * fs-verity user API + * + * These ioctls can be used on filesystems that support fs-verity. See the + * "User API" section of Documentation/filesystems/fsverity.rst. + * + * Copyright 2019 Google LLC + */ +#ifndef _UAPI_LINUX_FSVERITY_H +#define _UAPI_LINUX_FSVERITY_H + +#include +#include + +#define FS_VERITY_HASH_ALG_SHA256 1 + +struct fsverity_enable_arg { + __u32 version; + __u32 hash_algorithm; + __u32 block_size; + __u32 salt_size; + __u64 salt_ptr; + __u32 sig_size; + __u32 __reserved1; + __u64 sig_ptr; + __u64 __reserved2[11]; +}; + +struct fsverity_digest { + __u16 digest_algorithm; + __u16 digest_size; /* input/output */ + __u8 digest[]; +}; + +#define FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg) +#define FS_IOC_MEASURE_VERITY _IOWR('f', 134, struct fsverity_digest) + +#endif /* _UAPI_LINUX_FSVERITY_H */ -- cgit v1.2.3 From fe9918d3b228b3e8c726849d1486933f46b9069e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 Jul 2019 09:26:21 -0700 Subject: fs: uapi: define verity bit for FS_IOC_GETFLAGS Add FS_VERITY_FL to the flags for FS_IOC_GETFLAGS, so that applications can easily determine whether a file is a verity file at the same time as they're checking other file flags. This flag will be gettable only; FS_IOC_SETFLAGS won't allow setting it, since an ioctl must be used instead to provide more parameters. This flag matches the on-disk bit that was already allocated for ext4. Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- include/uapi/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 59c71fa8c553..df261b7e0587 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -306,6 +306,7 @@ struct fscrypt_key { #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define FS_HUGE_FILE_FL 0x00040000 /* Reserved for ext4 */ #define FS_EXTENT_FL 0x00080000 /* Extents */ +#define FS_VERITY_FL 0x00100000 /* Verity protected inode */ #define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ -- cgit v1.2.3 From 5585f2af737ae3d7454cb0ae77b995cd3ac7e43c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 Jul 2019 09:26:21 -0700 Subject: fs-verity: add inode and superblock fields Analogous to fs/crypto/, add fields to the VFS inode and superblock for use by the fs/verity/ support layer: - ->s_vop: points to the fsverity_operations if the filesystem supports fs-verity, otherwise is NULL. - ->i_verity_info: points to cached fs-verity information for the inode after someone opens it, otherwise is NULL. - S_VERITY: bit in ->i_flags that identifies verity inodes, even when they haven't been opened yet and thus still have NULL ->i_verity_info. Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- include/linux/fs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 56b8e358af5c..b3a0f5bfb06d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -64,6 +64,8 @@ struct workqueue_struct; struct iov_iter; struct fscrypt_info; struct fscrypt_operations; +struct fsverity_info; +struct fsverity_operations; struct fs_context; struct fs_parameter_description; @@ -723,6 +725,10 @@ struct inode { struct fscrypt_info *i_crypt_info; #endif +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif + void *i_private; /* fs or device private pointer */ } __randomize_layout; @@ -1427,6 +1433,9 @@ struct super_block { const struct xattr_handler **s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; +#endif +#ifdef CONFIG_FS_VERITY + const struct fsverity_operations *s_vop; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ @@ -1965,6 +1974,7 @@ struct super_operations { #endif #define S_ENCRYPTED 16384 /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD 32768 /* Casefolded file */ +#define S_VERITY 65536 /* Verity file (using fs/verity/) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -2006,6 +2016,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags #define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD) +#define IS_VERITY(inode) ((inode)->i_flags & S_VERITY) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) -- cgit v1.2.3 From fd2d1acfcadfe2e42567afaec5e989b38061a7d2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 Jul 2019 09:26:22 -0700 Subject: fs-verity: add the hook for file ->open() Add the fsverity_file_open() function, which prepares an fs-verity file to be read from. If not already done, it loads the fs-verity descriptor from the filesystem and sets up an fsverity_info structure for the inode which describes the Merkle tree and contains the file measurement. It also denies all attempts to open verity files for writing. This commit also begins the include/linux/fsverity.h header, which declares the interface between fs/verity/ and filesystems. Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- fs/verity/Makefile | 3 +- fs/verity/fsverity_private.h | 54 +++++++- fs/verity/init.c | 6 + fs/verity/open.c | 318 +++++++++++++++++++++++++++++++++++++++++++ include/linux/fsverity.h | 71 ++++++++++ 5 files changed, 449 insertions(+), 3 deletions(-) create mode 100644 fs/verity/open.c create mode 100644 include/linux/fsverity.h (limited to 'include') diff --git a/fs/verity/Makefile b/fs/verity/Makefile index 398f3f85fa18..e6a8951c493a 100644 --- a/fs/verity/Makefile +++ b/fs/verity/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_FS_VERITY) += hash_algs.o \ - init.o + init.o \ + open.o diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 9697aaebb5dc..c79746ff335e 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -15,8 +15,7 @@ #define pr_fmt(fmt) "fs-verity: " fmt #include -#include -#include +#include struct ahash_request; @@ -59,6 +58,40 @@ struct merkle_tree_params { u64 level_start[FS_VERITY_MAX_LEVELS]; }; +/** + * fsverity_info - cached verity metadata for an inode + * + * When a verity file is first opened, an instance of this struct is allocated + * and stored in ->i_verity_info; it remains until the inode is evicted. It + * caches information about the Merkle tree that's needed to efficiently verify + * data read from the file. It also caches the file measurement. The Merkle + * tree pages themselves are not cached here, but the filesystem may cache them. + */ +struct fsverity_info { + struct merkle_tree_params tree_params; + u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE]; + u8 measurement[FS_VERITY_MAX_DIGEST_SIZE]; + const struct inode *inode; +}; + +/* + * Merkle tree properties. The file measurement is the hash of this structure. + */ +struct fsverity_descriptor { + __u8 version; /* must be 1 */ + __u8 hash_algorithm; /* Merkle tree hash algorithm */ + __u8 log_blocksize; /* log2 of size of data and tree blocks */ + __u8 salt_size; /* size of salt in bytes; 0 if none */ + __le32 sig_size; /* reserved, must be 0 */ + __le64 data_size; /* size of file the Merkle tree is built over */ + __u8 root_hash[64]; /* Merkle tree root hash */ + __u8 salt[32]; /* salt prepended to each hashed block */ + __u8 __reserved[144]; /* must be 0's */ +}; + +/* Arbitrary limit to bound the kmalloc() size. Can be changed. */ +#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 + /* hash_algs.c */ extern struct fsverity_hash_alg fsverity_hash_algs[]; @@ -85,4 +118,21 @@ fsverity_msg(const struct inode *inode, const char *level, #define fsverity_err(inode, fmt, ...) \ fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) +/* open.c */ + +int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, + const struct inode *inode, + unsigned int hash_algorithm, + unsigned int log_blocksize, + const u8 *salt, size_t salt_size); + +struct fsverity_info *fsverity_create_info(const struct inode *inode, + const void *desc, size_t desc_size); + +void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); + +void fsverity_free_info(struct fsverity_info *vi); + +int __init fsverity_init_info_cache(void); + #endif /* _FSVERITY_PRIVATE_H */ diff --git a/fs/verity/init.c b/fs/verity/init.c index 40076bbe452a..fff1fd634335 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -33,8 +33,14 @@ void fsverity_msg(const struct inode *inode, const char *level, static int __init fsverity_init(void) { + int err; + fsverity_check_hash_algs(); + err = fsverity_init_info_cache(); + if (err) + return err; + pr_debug("Initialized fs-verity\n"); return 0; } diff --git a/fs/verity/open.c b/fs/verity/open.c new file mode 100644 index 000000000000..8013f77f907e --- /dev/null +++ b/fs/verity/open.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/open.c: opening fs-verity files + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include + +static struct kmem_cache *fsverity_info_cachep; + +/** + * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters + * @params: the parameters struct to initialize + * @inode: the inode for which the Merkle tree is being built + * @hash_algorithm: number of hash algorithm to use + * @log_blocksize: log base 2 of block size to use + * @salt: pointer to salt (optional) + * @salt_size: size of salt, possibly 0 + * + * Validate the hash algorithm and block size, then compute the tree topology + * (num levels, num blocks in each level, etc.) and initialize @params. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, + const struct inode *inode, + unsigned int hash_algorithm, + unsigned int log_blocksize, + const u8 *salt, size_t salt_size) +{ + const struct fsverity_hash_alg *hash_alg; + int err; + u64 blocks; + u64 offset; + int level; + + memset(params, 0, sizeof(*params)); + + hash_alg = fsverity_get_hash_alg(inode, hash_algorithm); + if (IS_ERR(hash_alg)) + return PTR_ERR(hash_alg); + params->hash_alg = hash_alg; + params->digest_size = hash_alg->digest_size; + + params->hashstate = fsverity_prepare_hash_state(hash_alg, salt, + salt_size); + if (IS_ERR(params->hashstate)) { + err = PTR_ERR(params->hashstate); + params->hashstate = NULL; + fsverity_err(inode, "Error %d preparing hash state", err); + goto out_err; + } + + if (log_blocksize != PAGE_SHIFT) { + fsverity_warn(inode, "Unsupported log_blocksize: %u", + log_blocksize); + err = -EINVAL; + goto out_err; + } + params->log_blocksize = log_blocksize; + params->block_size = 1 << log_blocksize; + + if (WARN_ON(!is_power_of_2(params->digest_size))) { + err = -EINVAL; + goto out_err; + } + if (params->block_size < 2 * params->digest_size) { + fsverity_warn(inode, + "Merkle tree block size (%u) too small for hash algorithm \"%s\"", + params->block_size, hash_alg->name); + err = -EINVAL; + goto out_err; + } + params->log_arity = params->log_blocksize - ilog2(params->digest_size); + params->hashes_per_block = 1 << params->log_arity; + + pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n", + hash_alg->name, params->block_size, params->hashes_per_block, + (int)salt_size, salt); + + /* + * Compute the number of levels in the Merkle tree and create a map from + * level to the starting block of that level. Level 'num_levels - 1' is + * the root and is stored first. Level 0 is the level directly "above" + * the data blocks and is stored last. + */ + + /* Compute number of levels and the number of blocks in each level */ + blocks = (inode->i_size + params->block_size - 1) >> log_blocksize; + pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); + while (blocks > 1) { + if (params->num_levels >= FS_VERITY_MAX_LEVELS) { + fsverity_err(inode, "Too many levels in Merkle tree"); + err = -EINVAL; + goto out_err; + } + blocks = (blocks + params->hashes_per_block - 1) >> + params->log_arity; + /* temporarily using level_start[] to store blocks in level */ + params->level_start[params->num_levels++] = blocks; + } + + /* Compute the starting block of each level */ + offset = 0; + for (level = (int)params->num_levels - 1; level >= 0; level--) { + blocks = params->level_start[level]; + params->level_start[level] = offset; + pr_debug("Level %d is %llu blocks starting at index %llu\n", + level, blocks, offset); + offset += blocks; + } + + params->tree_size = offset << log_blocksize; + return 0; + +out_err: + kfree(params->hashstate); + memset(params, 0, sizeof(*params)); + return err; +} + +/* Compute the file measurement by hashing the fsverity_descriptor. */ +static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg, + const struct fsverity_descriptor *desc, + u8 *measurement) +{ + return fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement); +} + +/* + * Validate the given fsverity_descriptor and create a new fsverity_info from + * it. + */ +struct fsverity_info *fsverity_create_info(const struct inode *inode, + const void *_desc, size_t desc_size) +{ + const struct fsverity_descriptor *desc = _desc; + struct fsverity_info *vi; + int err; + + if (desc_size < sizeof(*desc)) { + fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", + desc_size); + return ERR_PTR(-EINVAL); + } + + if (desc->version != 1) { + fsverity_err(inode, "Unrecognized descriptor version: %u", + desc->version); + return ERR_PTR(-EINVAL); + } + + if (desc->sig_size || + memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { + fsverity_err(inode, "Reserved bits set in descriptor"); + return ERR_PTR(-EINVAL); + } + + if (desc->salt_size > sizeof(desc->salt)) { + fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); + return ERR_PTR(-EINVAL); + } + + if (le64_to_cpu(desc->data_size) != inode->i_size) { + fsverity_err(inode, + "Wrong data_size: %llu (desc) != %lld (inode)", + le64_to_cpu(desc->data_size), inode->i_size); + return ERR_PTR(-EINVAL); + } + + vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); + if (!vi) + return ERR_PTR(-ENOMEM); + vi->inode = inode; + + err = fsverity_init_merkle_tree_params(&vi->tree_params, inode, + desc->hash_algorithm, + desc->log_blocksize, + desc->salt, desc->salt_size); + if (err) { + fsverity_err(inode, + "Error %d initializing Merkle tree parameters", + err); + goto out; + } + + memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); + + err = compute_file_measurement(vi->tree_params.hash_alg, desc, + vi->measurement); + if (err) { + fsverity_err(inode, "Error %d computing file measurement", err); + goto out; + } + pr_debug("Computed file measurement: %s:%*phN\n", + vi->tree_params.hash_alg->name, + vi->tree_params.digest_size, vi->measurement); +out: + if (err) { + fsverity_free_info(vi); + vi = ERR_PTR(err); + } + return vi; +} + +void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) +{ + /* + * Multiple processes may race to set ->i_verity_info, so use cmpxchg. + * This pairs with the READ_ONCE() in fsverity_get_info(). + */ + if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL) + fsverity_free_info(vi); +} + +void fsverity_free_info(struct fsverity_info *vi) +{ + if (!vi) + return; + kfree(vi->tree_params.hashstate); + kmem_cache_free(fsverity_info_cachep, vi); +} + +/* Ensure the inode has an ->i_verity_info */ +static int ensure_verity_info(struct inode *inode) +{ + struct fsverity_info *vi = fsverity_get_info(inode); + struct fsverity_descriptor *desc; + int res; + + if (vi) + return 0; + + res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); + if (res < 0) { + fsverity_err(inode, + "Error %d getting verity descriptor size", res); + return res; + } + if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + fsverity_err(inode, "Verity descriptor is too large (%d bytes)", + res); + return -EMSGSIZE; + } + desc = kmalloc(res, GFP_KERNEL); + if (!desc) + return -ENOMEM; + res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); + if (res < 0) { + fsverity_err(inode, "Error %d reading verity descriptor", res); + goto out_free_desc; + } + + vi = fsverity_create_info(inode, desc, res); + if (IS_ERR(vi)) { + res = PTR_ERR(vi); + goto out_free_desc; + } + + fsverity_set_info(inode, vi); + res = 0; +out_free_desc: + kfree(desc); + return res; +} + +/** + * fsverity_file_open() - prepare to open a verity file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * When opening a verity file, deny the open if it is for writing. Otherwise, + * set up the inode's ->i_verity_info if not already done. + * + * When combined with fscrypt, this must be called after fscrypt_file_open(). + * Otherwise, we won't have the key set up to decrypt the verity metadata. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_file_open(struct inode *inode, struct file *filp) +{ + if (!IS_VERITY(inode)) + return 0; + + if (filp->f_mode & FMODE_WRITE) { + pr_debug("Denying opening verity file (ino %lu) for write\n", + inode->i_ino); + return -EPERM; + } + + return ensure_verity_info(inode); +} +EXPORT_SYMBOL_GPL(fsverity_file_open); + +/** + * fsverity_cleanup_inode() - free the inode's verity info, if present + * + * Filesystems must call this on inode eviction to free ->i_verity_info. + */ +void fsverity_cleanup_inode(struct inode *inode) +{ + fsverity_free_info(inode->i_verity_info); + inode->i_verity_info = NULL; +} +EXPORT_SYMBOL_GPL(fsverity_cleanup_inode); + +int __init fsverity_init_info_cache(void) +{ + fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info, + SLAB_RECLAIM_ACCOUNT, + measurement); + if (!fsverity_info_cachep) + return -ENOMEM; + return 0; +} diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h new file mode 100644 index 000000000000..09b04dab6452 --- /dev/null +++ b/include/linux/fsverity.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs-verity: read-only file-based authenticity protection + * + * This header declares the interface between the fs/verity/ support layer and + * filesystems that support fs-verity. + * + * Copyright 2019 Google LLC + */ + +#ifndef _LINUX_FSVERITY_H +#define _LINUX_FSVERITY_H + +#include +#include + +/* Verity operations for filesystems */ +struct fsverity_operations { + + /** + * Get the verity descriptor of the given inode. + * + * @inode: an inode with the S_VERITY flag set + * @buf: buffer in which to place the verity descriptor + * @bufsize: size of @buf, or 0 to retrieve the size only + * + * If bufsize == 0, then the size of the verity descriptor is returned. + * Otherwise the verity descriptor is written to 'buf' and its actual + * size is returned; -ERANGE is returned if it's too large. This may be + * called by multiple processes concurrently on the same inode. + * + * Return: the size on success, -errno on failure + */ + int (*get_verity_descriptor)(struct inode *inode, void *buf, + size_t bufsize); +}; + +#ifdef CONFIG_FS_VERITY + +static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) +{ + /* pairs with the cmpxchg() in fsverity_set_info() */ + return READ_ONCE(inode->i_verity_info); +} + +/* open.c */ + +extern int fsverity_file_open(struct inode *inode, struct file *filp); +extern void fsverity_cleanup_inode(struct inode *inode); + +#else /* !CONFIG_FS_VERITY */ + +static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) +{ + return NULL; +} + +/* open.c */ + +static inline int fsverity_file_open(struct inode *inode, struct file *filp) +{ + return IS_VERITY(inode) ? -EOPNOTSUPP : 0; +} + +static inline void fsverity_cleanup_inode(struct inode *inode) +{ +} + +#endif /* !CONFIG_FS_VERITY */ + +#endif /* _LINUX_FSVERITY_H */ -- cgit v1.2.3 From c1d9b584e2cf3f0562d8fcf34574c044d17853a1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 Jul 2019 09:26:22 -0700 Subject: fs-verity: add the hook for file ->setattr() Add a function fsverity_prepare_setattr() which filesystems that support fs-verity must call to deny truncates of verity files. Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- fs/verity/open.c | 21 +++++++++++++++++++++ include/linux/fsverity.h | 7 +++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/fs/verity/open.c b/fs/verity/open.c index 8013f77f907e..2cb2fe8082bf 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -295,6 +295,27 @@ int fsverity_file_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL_GPL(fsverity_file_open); +/** + * fsverity_prepare_setattr() - prepare to change a verity inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Verity files are immutable, so deny truncates. This isn't covered by the + * open-time check because sys_truncate() takes a path, not a file descriptor. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) { + pr_debug("Denying truncate of verity file (ino %lu)\n", + d_inode(dentry)->i_ino); + return -EPERM; + } + return 0; +} +EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); + /** * fsverity_cleanup_inode() - free the inode's verity info, if present * diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 09b04dab6452..cbd0f84e1620 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -46,6 +46,7 @@ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) /* open.c */ extern int fsverity_file_open(struct inode *inode, struct file *filp); +extern int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); extern void fsverity_cleanup_inode(struct inode *inode); #else /* !CONFIG_FS_VERITY */ @@ -62,6 +63,12 @@ static inline int fsverity_file_open(struct inode *inode, struct file *filp) return IS_VERITY(inode) ? -EOPNOTSUPP : 0; } +static inline int fsverity_prepare_setattr(struct dentry *dentry, + struct iattr *attr) +{ + return IS_VERITY(d_inode(dentry)) ? -EOPNOTSUPP : 0; +} + static inline void fsverity_cleanup_inode(struct inode *inode) { } -- cgit v1.2.3 From 8a1d0f9cacc997bedc017056a94f35dc823394ed Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 Jul 2019 09:26:22 -0700 Subject: fs-verity: add data verification hooks for ->readpages() Add functions that verify data pages that have been read from a fs-verity file, against that file's Merkle tree. These will be called from filesystems' ->readpage() and ->readpages() methods. Since data verification can block, a workqueue is provided for these methods to enqueue verification work from their bio completion callback. See the "Verifying data" section of Documentation/filesystems/fsverity.rst for more information. Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- fs/verity/Makefile | 3 +- fs/verity/fsverity_private.h | 5 + fs/verity/init.c | 8 ++ fs/verity/open.c | 6 + fs/verity/verify.c | 275 +++++++++++++++++++++++++++++++++++++++++++ include/linux/fsverity.h | 56 +++++++++ 6 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 fs/verity/verify.c (limited to 'include') diff --git a/fs/verity/Makefile b/fs/verity/Makefile index e6a8951c493a..7fa628cd5eba 100644 --- a/fs/verity/Makefile +++ b/fs/verity/Makefile @@ -2,4 +2,5 @@ obj-$(CONFIG_FS_VERITY) += hash_algs.o \ init.o \ - open.o + open.o \ + verify.o diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index c79746ff335e..eaa2b3b93bbf 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -134,5 +134,10 @@ void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); void fsverity_free_info(struct fsverity_info *vi); int __init fsverity_init_info_cache(void); +void __init fsverity_exit_info_cache(void); + +/* verify.c */ + +int __init fsverity_init_workqueue(void); #endif /* _FSVERITY_PRIVATE_H */ diff --git a/fs/verity/init.c b/fs/verity/init.c index fff1fd634335..b593805aafcc 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -41,7 +41,15 @@ static int __init fsverity_init(void) if (err) return err; + err = fsverity_init_workqueue(); + if (err) + goto err_exit_info_cache; + pr_debug("Initialized fs-verity\n"); return 0; + +err_exit_info_cache: + fsverity_exit_info_cache(); + return err; } late_initcall(fsverity_init) diff --git a/fs/verity/open.c b/fs/verity/open.c index 2cb2fe8082bf..3636a1ed8e2c 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -337,3 +337,9 @@ int __init fsverity_init_info_cache(void) return -ENOMEM; return 0; } + +void __init fsverity_exit_info_cache(void) +{ + kmem_cache_destroy(fsverity_info_cachep); + fsverity_info_cachep = NULL; +} diff --git a/fs/verity/verify.c b/fs/verity/verify.c new file mode 100644 index 000000000000..62ab8f6a8ea1 --- /dev/null +++ b/fs/verity/verify.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages() + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include +#include +#include + +static struct workqueue_struct *fsverity_read_workqueue; + +/** + * hash_at_level() - compute the location of the block's hash at the given level + * + * @params: (in) the Merkle tree parameters + * @dindex: (in) the index of the data block being verified + * @level: (in) the level of hash we want (0 is leaf level) + * @hindex: (out) the index of the hash block containing the wanted hash + * @hoffset: (out) the byte offset to the wanted hash within the hash block + */ +static void hash_at_level(const struct merkle_tree_params *params, + pgoff_t dindex, unsigned int level, pgoff_t *hindex, + unsigned int *hoffset) +{ + pgoff_t position; + + /* Offset of the hash within the level's region, in hashes */ + position = dindex >> (level * params->log_arity); + + /* Index of the hash block in the tree overall */ + *hindex = params->level_start[level] + (position >> params->log_arity); + + /* Offset of the wanted hash (in bytes) within the hash block */ + *hoffset = (position & ((1 << params->log_arity) - 1)) << + (params->log_blocksize - params->log_arity); +} + +/* Extract a hash from a hash page */ +static void extract_hash(struct page *hpage, unsigned int hoffset, + unsigned int hsize, u8 *out) +{ + void *virt = kmap_atomic(hpage); + + memcpy(out, virt + hoffset, hsize); + kunmap_atomic(virt); +} + +static inline int cmp_hashes(const struct fsverity_info *vi, + const u8 *want_hash, const u8 *real_hash, + pgoff_t index, int level) +{ + const unsigned int hsize = vi->tree_params.digest_size; + + if (memcmp(want_hash, real_hash, hsize) == 0) + return 0; + + fsverity_err(vi->inode, + "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + index, level, + vi->tree_params.hash_alg->name, hsize, want_hash, + vi->tree_params.hash_alg->name, hsize, real_hash); + return -EBADMSG; +} + +/* + * Verify a single data page against the file's Merkle tree. + * + * In principle, we need to verify the entire path to the root node. However, + * for efficiency the filesystem may cache the hash pages. Therefore we need + * only ascend the tree until an already-verified page is seen, as indicated by + * the PageChecked bit being set; then verify the path to that page. + * + * This code currently only supports the case where the verity block size is + * equal to PAGE_SIZE. Doing otherwise would be possible but tricky, since we + * wouldn't be able to use the PageChecked bit. + * + * Note that multiple processes may race to verify a hash page and mark it + * Checked, but it doesn't matter; the result will be the same either way. + * + * Return: true if the page is valid, else false. + */ +static bool verify_page(struct inode *inode, const struct fsverity_info *vi, + struct ahash_request *req, struct page *data_page) +{ + const struct merkle_tree_params *params = &vi->tree_params; + const unsigned int hsize = params->digest_size; + const pgoff_t index = data_page->index; + int level; + u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; + const u8 *want_hash; + u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; + struct page *hpages[FS_VERITY_MAX_LEVELS]; + unsigned int hoffsets[FS_VERITY_MAX_LEVELS]; + int err; + + if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page))) + return false; + + pr_debug_ratelimited("Verifying data page %lu...\n", index); + + /* + * Starting at the leaf level, ascend the tree saving hash pages along + * the way until we find a verified hash page, indicated by PageChecked; + * or until we reach the root. + */ + for (level = 0; level < params->num_levels; level++) { + pgoff_t hindex; + unsigned int hoffset; + struct page *hpage; + + hash_at_level(params, index, level, &hindex, &hoffset); + + pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n", + level, hindex, hoffset); + + hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, + hindex); + if (IS_ERR(hpage)) { + err = PTR_ERR(hpage); + fsverity_err(inode, + "Error %d reading Merkle tree page %lu", + err, hindex); + goto out; + } + + if (PageChecked(hpage)) { + extract_hash(hpage, hoffset, hsize, _want_hash); + want_hash = _want_hash; + put_page(hpage); + pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", + params->hash_alg->name, + hsize, want_hash); + goto descend; + } + pr_debug_ratelimited("Hash page not yet checked\n"); + hpages[level] = hpage; + hoffsets[level] = hoffset; + } + + want_hash = vi->root_hash; + pr_debug("Want root hash: %s:%*phN\n", + params->hash_alg->name, hsize, want_hash); +descend: + /* Descend the tree verifying hash pages */ + for (; level > 0; level--) { + struct page *hpage = hpages[level - 1]; + unsigned int hoffset = hoffsets[level - 1]; + + err = fsverity_hash_page(params, inode, req, hpage, real_hash); + if (err) + goto out; + err = cmp_hashes(vi, want_hash, real_hash, index, level - 1); + if (err) + goto out; + SetPageChecked(hpage); + extract_hash(hpage, hoffset, hsize, _want_hash); + want_hash = _want_hash; + put_page(hpage); + pr_debug("Verified hash page at level %d, now want %s:%*phN\n", + level - 1, params->hash_alg->name, hsize, want_hash); + } + + /* Finally, verify the data page */ + err = fsverity_hash_page(params, inode, req, data_page, real_hash); + if (err) + goto out; + err = cmp_hashes(vi, want_hash, real_hash, index, -1); +out: + for (; level > 0; level--) + put_page(hpages[level - 1]); + + return err == 0; +} + +/** + * fsverity_verify_page() - verify a data page + * + * Verify a page that has just been read from a verity file. The page must be a + * pagecache page that is still locked and not yet uptodate. + * + * Return: true if the page is valid, else false. + */ +bool fsverity_verify_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + const struct fsverity_info *vi = inode->i_verity_info; + struct ahash_request *req; + bool valid; + + req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); + if (unlikely(!req)) + return false; + + valid = verify_page(inode, vi, req, page); + + ahash_request_free(req); + + return valid; +} +EXPORT_SYMBOL_GPL(fsverity_verify_page); + +#ifdef CONFIG_BLOCK +/** + * fsverity_verify_bio() - verify a 'read' bio that has just completed + * + * Verify a set of pages that have just been read from a verity file. The pages + * must be pagecache pages that are still locked and not yet uptodate. Pages + * that fail verification are set to the Error state. Verification is skipped + * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * + * This is a helper function for use by the ->readpages() method of filesystems + * that issue bios to read data directly into the page cache. Filesystems that + * populate the page cache without issuing bios (e.g. non block-based + * filesystems) must instead call fsverity_verify_page() directly on each page. + * All filesystems must also call fsverity_verify_page() on holes. + */ +void fsverity_verify_bio(struct bio *bio) +{ + struct inode *inode = bio_first_page_all(bio)->mapping->host; + const struct fsverity_info *vi = inode->i_verity_info; + struct ahash_request *req; + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); + if (unlikely(!req)) { + bio_for_each_segment_all(bv, bio, iter_all) + SetPageError(bv->bv_page); + return; + } + + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!PageError(page) && !verify_page(inode, vi, req, page)) + SetPageError(page); + } + + ahash_request_free(req); +} +EXPORT_SYMBOL_GPL(fsverity_verify_bio); +#endif /* CONFIG_BLOCK */ + +/** + * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue + * + * Enqueue verification work for asynchronous processing. + */ +void fsverity_enqueue_verify_work(struct work_struct *work) +{ + queue_work(fsverity_read_workqueue, work); +} +EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work); + +int __init fsverity_init_workqueue(void) +{ + /* + * Use an unbound workqueue to allow bios to be verified in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since hashing is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize verification work, + * which blocks reads from completing, over regular application tasks. + */ + fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + if (!fsverity_read_workqueue) + return -ENOMEM; + return 0; +} diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index cbd0f84e1620..95c257cd7ff0 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -33,6 +33,23 @@ struct fsverity_operations { */ int (*get_verity_descriptor)(struct inode *inode, void *buf, size_t bufsize); + + /** + * Read a Merkle tree page of the given inode. + * + * @inode: the inode + * @index: 0-based index of the page within the Merkle tree + * + * This can be called at any time on an open verity file, as well as + * between ->begin_enable_verity() and ->end_enable_verity(). It may be + * called by multiple processes concurrently, even with the same page. + * + * Note that this must retrieve a *page*, not necessarily a *block*. + * + * Return: the page on success, ERR_PTR() on failure + */ + struct page *(*read_merkle_tree_page)(struct inode *inode, + pgoff_t index); }; #ifdef CONFIG_FS_VERITY @@ -49,6 +66,12 @@ extern int fsverity_file_open(struct inode *inode, struct file *filp); extern int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); extern void fsverity_cleanup_inode(struct inode *inode); +/* verify.c */ + +extern bool fsverity_verify_page(struct page *page); +extern void fsverity_verify_bio(struct bio *bio); +extern void fsverity_enqueue_verify_work(struct work_struct *work); + #else /* !CONFIG_FS_VERITY */ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) @@ -73,6 +96,39 @@ static inline void fsverity_cleanup_inode(struct inode *inode) { } +/* verify.c */ + +static inline bool fsverity_verify_page(struct page *page) +{ + WARN_ON(1); + return false; +} + +static inline void fsverity_verify_bio(struct bio *bio) +{ + WARN_ON(1); +} + +static inline void fsverity_enqueue_verify_work(struct work_struct *work) +{ + WARN_ON(1); +} + #endif /* !CONFIG_FS_VERITY */ +/** + * fsverity_active() - do reads from the inode need to go through fs-verity? + * + * This checks whether ->i_verity_info has been set. + * + * Filesystems call this from ->readpages() to check whether the pages need to + * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to + * a race condition where the file is being read concurrently with + * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) + */ +static inline bool fsverity_active(const struct inode *inode) +{ + return fsverity_get_info(inode) != NULL; +} + #endif /* _LINUX_FSVERITY_H */ -- cgit v1.2.3 From 91826ba13855f73e252fef68369b3b0e1ed25253 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 29 Jul 2019 00:51:38 +0900 Subject: netfilter: add include guard to xt_connlabel.h Add a header include guard just in case. Signed-off-by: Masahiro Yamada Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_connlabel.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/xt_connlabel.h b/include/uapi/linux/netfilter/xt_connlabel.h index 2312f0ec07b2..323f0dfc2a4e 100644 --- a/include/uapi/linux/netfilter/xt_connlabel.h +++ b/include/uapi/linux/netfilter/xt_connlabel.h @@ -1,4 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_XT_CONNLABEL_H +#define _UAPI_XT_CONNLABEL_H + #include #define XT_CONNLABEL_MAXBIT 127 @@ -11,3 +15,5 @@ struct xt_connlabel_mtinfo { __u16 bit; __u16 options; }; + +#endif /* _UAPI_XT_CONNLABEL_H */ -- cgit v1.2.3 From 90d4962cfc87a6994a19db0d76e1fa214dd67267 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Mon, 29 Jul 2019 12:23:41 +0200 Subject: mac80211: fix ieee80211_he_oper_size() comment Johannes mentioned that the comment should not reference mac80211 as other subsystems might call the helper. Signed-off-by: John Crispin Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20190729102342.8659-1-john@phrozen.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a656f31262f1..9c81895c63c1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2053,8 +2053,8 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size * @he_oper_ie: byte data of the He Operations IE, stating from the the byte * after the ext ID byte. It is assumed that he_oper_ie has at least - * sizeof(struct ieee80211_he_operation) bytes, checked already in - * ieee802_11_parse_elems_crc() + * sizeof(struct ieee80211_he_operation) bytes, the caller must have + * validated this. * @return the actual size of the IE data (not including header), or 0 on error */ static inline u8 -- cgit v1.2.3 From 697f6c507c74991057eb6df3cfb46579ca136467 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Mon, 29 Jul 2019 12:23:42 +0200 Subject: mac80211: propagate HE operation info into bss_conf Upon a successful assoc a station shall store the content of the HE operation element inside bss_conf so that the driver can setup the hardware accordingly. Signed-off-by: Shashidhar Lakkavalli Signed-off-by: John Crispin Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20190729102342.8659-2-john@phrozen.org [use struct copy] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/he.c | 15 +++++++++++++++ net/mac80211/ieee80211_i.h | 4 ++++ net/mac80211/mlme.c | 1 + 4 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 9effd286c1ae..bd91388797fc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -600,6 +600,7 @@ struct ieee80211_ftm_responder_params { * nontransmitted BSSIDs * @profile_periodicity: the least number of beacon frames need to be received * in order to discover all the nontransmitted BSSIDs in the set. + * @he_operation: HE operation information of the AP we are connected to */ struct ieee80211_bss_conf { const u8 *bssid; @@ -661,6 +662,7 @@ struct ieee80211_bss_conf { u8 bssid_indicator; bool ema_ap; u8 profile_periodicity; + struct ieee80211_he_operation he_operation; }; /** diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 219650591c79..f910f730ad0d 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -50,3 +50,18 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, he_cap->has_he = true; } + +void +ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, + const struct ieee80211_he_operation *he_op_ie_elem) +{ + struct ieee80211_he_operation *he_operation = + &vif->bss_conf.he_operation; + + if (!he_op_ie_elem) { + memset(he_operation, 0, sizeof(*he_operation)); + return; + } + + vif->bss_conf.he_operation = *he_op_ie_elem; +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a5818ff0e2a4..ba1bc245678e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1873,6 +1873,10 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, const u8 *he_cap_ie, u8 he_cap_len, struct sta_info *sta); +void +ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, + const struct ieee80211_he_operation *he_op_ie_elem); + /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a99ad0325309..2b8a7428973d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3381,6 +3381,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (elems.uora_element) bss_conf->uora_ocw_range = elems.uora_element[0]; + ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems.he_operation); /* TODO: OPEN: what happens if BSS color disable is set? */ } -- cgit v1.2.3 From 2ab45876756fb6c132ae801b0939e0474f84c426 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Mon, 29 Jul 2019 12:45:12 +0200 Subject: mac80211: add support for the ADDBA extension element HE allows peers to negotiate the aggregation fragmentation level to be used during transmission. The level can be 1-3. The Ext element is added behind the ADDBA request inside the action frame. The responder will then reply with the same level or a lower one if the requested one is not supported. This patch only handles the negotiation part as the ADDBA frames get passed to the ATH11k firmware, which does the rest of the magic for us aswell as generating the requests. Signed-off-by: Shashidhar Lakkavalli Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190729104512.27615-1-john@phrozen.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ net/mac80211/agg-rx.c | 70 ++++++++++++++++++++++++++++++++++++++++------ net/mac80211/ht.c | 2 +- net/mac80211/ieee80211_i.h | 3 +- 4 files changed, 66 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9c81895c63c1..7d3f2ced92d1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -981,6 +981,8 @@ struct ieee80211_mgmt { __le16 capab; __le16 timeout; __le16 start_seq_num; + /* followed by BA Extension */ + u8 variable[0]; } __packed addba_req; struct{ u8 action_code; diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 01b0dad24500..0e1bb43973b8 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -178,17 +178,52 @@ static void sta_rx_agg_reorder_timer_expired(struct timer_list *t) rcu_read_unlock(); } -static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, +static void ieee80211_add_addbaext(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + const struct ieee80211_addba_ext_ie *req) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_addba_ext_ie *resp; + const struct ieee80211_sta_he_cap *he_cap; + u8 frag_level, cap_frag_level; + u8 *pos; + + sband = ieee80211_get_sband(sdata); + he_cap = ieee80211_get_he_iftype_cap(sband, sdata->vif.type); + if (!he_cap) + return; + + pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie)); + *pos++ = WLAN_EID_ADDBA_EXT; + *pos++ = sizeof(struct ieee80211_addba_ext_ie); + resp = (struct ieee80211_addba_ext_ie *)pos; + resp->data = req->data & IEEE80211_ADDBA_EXT_NO_FRAG; + + frag_level = u32_get_bits(req->data, + IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK); + cap_frag_level = u32_get_bits(he_cap->he_cap_elem.mac_cap_info[0], + IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK); + if (frag_level > cap_frag_level) + frag_level = cap_frag_level; + resp->data |= u8_encode_bits(frag_level, + IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK); +} + +static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, u8 dialog_token, u16 status, u16 policy, - u16 buf_size, u16 timeout) + u16 buf_size, u16 timeout, + const struct ieee80211_addba_ext_ie *addbaext) { + struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU); u16 capab; - skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + skb = dev_alloc_skb(sizeof(*mgmt) + + 2 + sizeof(struct ieee80211_addba_ext_ie) + + local->hw.extra_tx_headroom); if (!skb) return; @@ -222,13 +257,17 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); + if (sta->sta.he_cap.has_he && addbaext) + ieee80211_add_addbaext(sdata, skb, addbaext); + ieee80211_tx_skb(sdata, skb); } void ___ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq) + u16 buf_size, bool tx, bool auto_seq, + const struct ieee80211_addba_ext_ie *addbaext) { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; @@ -410,21 +449,22 @@ end: } if (tx) - ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, + ieee80211_send_addba_resp(sta, sta->sta.addr, tid, dialog_token, status, 1, buf_size, - timeout); + timeout, addbaext); } static void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, - bool auto_seq) + bool auto_seq, + const struct ieee80211_addba_ext_ie *addbaext) { mutex_lock(&sta->ampdu_mlme.mtx); ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, - buf_size, tx, auto_seq); + buf_size, tx, auto_seq, addbaext); mutex_unlock(&sta->ampdu_mlme.mtx); } @@ -434,7 +474,9 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, size_t len) { u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; + struct ieee802_11_elems elems = { 0 }; u8 dialog_token; + int ies_len; /* extract session parameters from addba request frame */ dialog_token = mgmt->u.action.u.addba_req.dialog_token; @@ -447,9 +489,19 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + ies_len = len - offsetof(struct ieee80211_mgmt, + u.action.u.addba_req.variable); + if (ies_len) { + ieee802_11_parse_elems(mgmt->u.action.u.addba_req.variable, + ies_len, true, &elems, mgmt->bssid, NULL); + if (elems.parse_error) + return; + } + __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, - buf_size, true, false); + buf_size, true, false, + elems.addba_ext_ie); } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index d5a500b2a448..a2e4d6b8fd98 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -359,7 +359,7 @@ void ieee80211_ba_session_work(struct work_struct *work) sta->ampdu_mlme.tid_rx_manage_offl)) ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, IEEE80211_MAX_AMPDU_BUF_HT, - false, true); + false, true, NULL); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ba1bc245678e..38769f5c3da4 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1807,7 +1807,8 @@ void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, void ___ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq); + u16 buf_size, bool tx, bool auto_seq, + const struct ieee80211_addba_ext_ie *addbaext); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From 3b0b278312ba7d6c1eb8b2fb48d459fb7f341a20 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Jul 2019 16:35:03 +0300 Subject: NFC: nxp-nci: Get rid of platform data Legacy platform data must go away. We are on the safe side here since there are no users of it in the kernel. If anyone by any odd reason needs it the GPIO lookup tables and built-in device properties at your service. Signed-off-by: Andy Shevchenko Tested-by: Sedat Dilek Signed-off-by: David S. Miller --- MAINTAINERS | 1 - drivers/nfc/nxp-nci/core.c | 1 - drivers/nfc/nxp-nci/i2c.c | 9 +-------- drivers/nfc/nxp-nci/nxp-nci.h | 1 - include/linux/platform_data/nxp-nci.h | 19 ------------------- 5 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 include/linux/platform_data/nxp-nci.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 9cc156c58f0c..ee663e0e2f2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11327,7 +11327,6 @@ F: include/net/nfc/ F: include/uapi/linux/nfc.h F: drivers/nfc/ F: include/linux/platform_data/nfcmrvl.h -F: include/linux/platform_data/nxp-nci.h F: Documentation/devicetree/bindings/net/nfc/ NFS, SUNRPC, AND LOCKD CLIENTS diff --git a/drivers/nfc/nxp-nci/core.c b/drivers/nfc/nxp-nci/core.c index 8dafc696719f..aed18ca60170 100644 --- a/drivers/nfc/nxp-nci/core.c +++ b/drivers/nfc/nxp-nci/core.c @@ -14,7 +14,6 @@ #include #include #include -#include #include diff --git a/drivers/nfc/nxp-nci/i2c.c b/drivers/nfc/nxp-nci/i2c.c index 5db71869f04b..47b3b7e612e6 100644 --- a/drivers/nfc/nxp-nci/i2c.c +++ b/drivers/nfc/nxp-nci/i2c.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -304,7 +303,6 @@ static int nxp_nci_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) { struct nxp_nci_i2c_phy *phy; - struct nxp_nci_nfc_platform_data *pdata; int r; if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { @@ -323,17 +321,12 @@ static int nxp_nci_i2c_probe(struct i2c_client *client, phy->i2c_dev = client; i2c_set_clientdata(client, phy); - pdata = client->dev.platform_data; - - if (!pdata && client->dev.of_node) { + if (client->dev.of_node) { r = nxp_nci_i2c_parse_devtree(client); if (r < 0) { nfc_err(&client->dev, "Failed to get DT data\n"); goto probe_exit; } - } else if (pdata) { - phy->gpio_en = pdata->gpio_en; - phy->gpio_fw = pdata->gpio_fw; } else if (ACPI_HANDLE(&client->dev)) { r = nxp_nci_i2c_acpi_config(phy); if (r < 0) diff --git a/drivers/nfc/nxp-nci/nxp-nci.h b/drivers/nfc/nxp-nci/nxp-nci.h index 6fe7c45544bf..ae3fb2735a4e 100644 --- a/drivers/nfc/nxp-nci/nxp-nci.h +++ b/drivers/nfc/nxp-nci/nxp-nci.h @@ -14,7 +14,6 @@ #include #include #include -#include #include diff --git a/include/linux/platform_data/nxp-nci.h b/include/linux/platform_data/nxp-nci.h deleted file mode 100644 index 97827ad468e2..000000000000 --- a/include/linux/platform_data/nxp-nci.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Generic platform data for the NXP NCI NFC chips. - * - * Copyright (C) 2014 NXP Semiconductors All rights reserved. - * - * Authors: Clément Perrochaud - */ - -#ifndef _NXP_NCI_H_ -#define _NXP_NCI_H_ - -struct nxp_nci_nfc_platform_data { - unsigned int gpio_en; - unsigned int gpio_fw; - unsigned int irq; -}; - -#endif /* _NXP_NCI_H_ */ -- cgit v1.2.3 From 56f8af5e9d38f120cba2c2adb0786fa2dbc901a4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:44:06 +0100 Subject: iommu: Pass struct iommu_iotlb_gather to ->unmap() and ->iotlb_sync() To allow IOMMU drivers to batch up TLB flushing operations and postpone them until ->iotlb_sync() is called, extend the prototypes for the ->unmap() and ->iotlb_sync() IOMMU ops callbacks to take a pointer to the current iommu_iotlb_gather structure. All affected IOMMU drivers are updated, but there should be no functional change since the extra parameter is ignored for now. Signed-off-by: Will Deacon --- drivers/iommu/amd_iommu.c | 11 +++++++++-- drivers/iommu/arm-smmu-v3.c | 7 ++++--- drivers/iommu/arm-smmu.c | 5 +++-- drivers/iommu/exynos-iommu.c | 3 ++- drivers/iommu/intel-iommu.c | 3 ++- drivers/iommu/iommu.c | 2 +- drivers/iommu/ipmmu-vmsa.c | 12 +++++++++--- drivers/iommu/msm_iommu.c | 2 +- drivers/iommu/mtk_iommu.c | 13 ++++++++++--- drivers/iommu/mtk_iommu_v1.c | 3 ++- drivers/iommu/omap-iommu.c | 2 +- drivers/iommu/qcom_iommu.c | 12 +++++++++--- drivers/iommu/rockchip-iommu.c | 2 +- drivers/iommu/s390-iommu.c | 3 ++- drivers/iommu/tegra-gart.c | 12 +++++++++--- drivers/iommu/tegra-smmu.c | 2 +- drivers/iommu/virtio-iommu.c | 5 +++-- include/linux/iommu.h | 7 ++++--- 18 files changed, 73 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index f93b148cf55e..29eeea914660 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3055,7 +3055,8 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, } static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, - size_t page_size) + size_t page_size, + struct iommu_iotlb_gather *gather) { struct protection_domain *domain = to_pdomain(dom); size_t unmap_size; @@ -3196,6 +3197,12 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) domain_flush_complete(dom); } +static void amd_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + amd_iommu_flush_iotlb_all(domain); +} + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, @@ -3214,7 +3221,7 @@ const struct iommu_ops amd_iommu_ops = { .is_attach_deferred = amd_iommu_is_attach_deferred, .pgsize_bitmap = AMD_IOMMU_PGSIZES, .flush_iotlb_all = amd_iommu_flush_iotlb_all, - .iotlb_sync = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, }; /***************************************************************************** diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 7e137e1e28f1..80753b8ca054 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1985,8 +1985,8 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, return ops->map(ops, iova, paddr, size, prot); } -static size_t -arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) +static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size, struct iommu_iotlb_gather *gather) { int ret; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); @@ -2010,7 +2010,8 @@ static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) arm_smmu_tlb_inv_context(smmu_domain); } -static void arm_smmu_iotlb_sync(struct iommu_domain *domain) +static void arm_smmu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index dc08db347ef3..e535ae2a9e65 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1301,7 +1301,7 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, } static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; @@ -1329,7 +1329,8 @@ static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) } } -static void arm_smmu_iotlb_sync(struct iommu_domain *domain) +static void arm_smmu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index b0c1e5f9daae..cf5af34cb681 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -1130,7 +1130,8 @@ static void exynos_iommu_tlb_invalidate_entry(struct exynos_iommu_domain *domain } static size_t exynos_iommu_unmap(struct iommu_domain *iommu_domain, - unsigned long l_iova, size_t size) + unsigned long l_iova, size_t size, + struct iommu_iotlb_gather *gather) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); sysmmu_iova_t iova = (sysmmu_iova_t)l_iova; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ac4172c02244..b9fb8d6ddc6e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5147,7 +5147,8 @@ static int intel_iommu_map(struct iommu_domain *domain, } static size_t intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct page *freelist = NULL; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d67222fdfe44..70bfbcc09248 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1899,7 +1899,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain, while (unmapped < size) { size_t pgsize = iommu_pgsize(domain, iova, size - unmapped); - unmapped_page = ops->unmap(domain, iova, pgsize); + unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather); if (!unmapped_page) break; diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 2c14a2c65b22..a9332b893ce2 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -733,14 +733,14 @@ static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova, } static size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); return domain->iop->unmap(domain->iop, iova, size); } -static void ipmmu_iotlb_sync(struct iommu_domain *io_domain) +static void ipmmu_flush_iotlb_all(struct iommu_domain *io_domain) { struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); @@ -748,6 +748,12 @@ static void ipmmu_iotlb_sync(struct iommu_domain *io_domain) ipmmu_tlb_flush_all(domain); } +static void ipmmu_iotlb_sync(struct iommu_domain *io_domain, + struct iommu_iotlb_gather *gather) +{ + ipmmu_flush_iotlb_all(io_domain); +} + static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain, dma_addr_t iova) { @@ -957,7 +963,7 @@ static const struct iommu_ops ipmmu_ops = { .detach_dev = ipmmu_detach_device, .map = ipmmu_map, .unmap = ipmmu_unmap, - .flush_iotlb_all = ipmmu_iotlb_sync, + .flush_iotlb_all = ipmmu_flush_iotlb_all, .iotlb_sync = ipmmu_iotlb_sync, .iova_to_phys = ipmmu_iova_to_phys, .add_device = ipmmu_add_device, diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 8b602384a385..681ab3d3376d 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -509,7 +509,7 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t len) + size_t len, struct iommu_iotlb_gather *gather) { struct msm_priv *priv = to_msm_priv(domain); unsigned long flags; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index fed77658d67e..c870f1674903 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -371,7 +371,8 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t mtk_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); unsigned long flags; @@ -384,7 +385,13 @@ static size_t mtk_iommu_unmap(struct iommu_domain *domain, return unmapsz; } -static void mtk_iommu_iotlb_sync(struct iommu_domain *domain) +static void mtk_iommu_flush_iotlb_all(struct iommu_domain *domain) +{ + mtk_iommu_tlb_sync(mtk_iommu_get_m4u_data()); +} + +static void mtk_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { mtk_iommu_tlb_sync(mtk_iommu_get_m4u_data()); } @@ -490,7 +497,7 @@ static const struct iommu_ops mtk_iommu_ops = { .detach_dev = mtk_iommu_detach_device, .map = mtk_iommu_map, .unmap = mtk_iommu_unmap, - .flush_iotlb_all = mtk_iommu_iotlb_sync, + .flush_iotlb_all = mtk_iommu_flush_iotlb_all, .iotlb_sync = mtk_iommu_iotlb_sync, .iova_to_phys = mtk_iommu_iova_to_phys, .add_device = mtk_iommu_add_device, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index abeeac488372..7b92ddd5d9fd 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -324,7 +324,8 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t mtk_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); unsigned long flags; diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index dfb961d8c21b..8039bc5ee425 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1149,7 +1149,7 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, } static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct omap_iommu_domain *omap_domain = to_omap_domain(domain); struct device *dev = omap_domain->dev; diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index fd9d9f4da735..a7432991fa04 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -417,7 +417,7 @@ static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { size_t ret; unsigned long flags; @@ -441,7 +441,7 @@ static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova, return ret; } -static void qcom_iommu_iotlb_sync(struct iommu_domain *domain) +static void qcom_iommu_flush_iotlb_all(struct iommu_domain *domain) { struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); struct io_pgtable *pgtable = container_of(qcom_domain->pgtbl_ops, @@ -454,6 +454,12 @@ static void qcom_iommu_iotlb_sync(struct iommu_domain *domain) pm_runtime_put_sync(qcom_domain->iommu->dev); } +static void qcom_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + qcom_iommu_flush_iotlb_all(domain); +} + static phys_addr_t qcom_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { @@ -581,7 +587,7 @@ static const struct iommu_ops qcom_iommu_ops = { .detach_dev = qcom_iommu_detach_dev, .map = qcom_iommu_map, .unmap = qcom_iommu_unmap, - .flush_iotlb_all = qcom_iommu_iotlb_sync, + .flush_iotlb_all = qcom_iommu_flush_iotlb_all, .iotlb_sync = qcom_iommu_iotlb_sync, .iova_to_phys = qcom_iommu_iova_to_phys, .add_device = qcom_iommu_add_device, diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index dc26d74d79c2..26290f310f90 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -794,7 +794,7 @@ static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova, } static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct rk_iommu_domain *rk_domain = to_rk_domain(domain); unsigned long flags; diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 22d4db302c1c..3b0b18e23187 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -314,7 +314,8 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain, } static size_t s390_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) { struct s390_domain *s390_domain = to_s390_domain(domain); int flags = ZPCI_PTE_INVALID; diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index 6d40bc1b38bf..3924f7c05544 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -207,7 +207,7 @@ static inline int __gart_iommu_unmap(struct gart_device *gart, } static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t bytes) + size_t bytes, struct iommu_iotlb_gather *gather) { struct gart_device *gart = gart_handle; int err; @@ -273,11 +273,17 @@ static int gart_iommu_of_xlate(struct device *dev, return 0; } -static void gart_iommu_sync(struct iommu_domain *domain) +static void gart_iommu_sync_map(struct iommu_domain *domain) { FLUSH_GART_REGS(gart_handle); } +static void gart_iommu_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + gart_iommu_sync_map(domain); +} + static const struct iommu_ops gart_iommu_ops = { .capable = gart_iommu_capable, .domain_alloc = gart_iommu_domain_alloc, @@ -292,7 +298,7 @@ static const struct iommu_ops gart_iommu_ops = { .iova_to_phys = gart_iommu_iova_to_phys, .pgsize_bitmap = GART_IOMMU_PGSIZES, .of_xlate = gart_iommu_of_xlate, - .iotlb_sync_map = gart_iommu_sync, + .iotlb_sync_map = gart_iommu_sync_map, .iotlb_sync = gart_iommu_sync, }; diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index c4a652b227f8..7293fc3f796d 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -680,7 +680,7 @@ static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova, } static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct tegra_smmu_as *as = to_smmu_as(domain); dma_addr_t pte_dma; diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 433f4d2ee956..5f9f91a4d7f3 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -742,7 +742,7 @@ static int viommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { int ret = 0; size_t unmapped; @@ -788,7 +788,8 @@ static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain, return paddr; } -static void viommu_iotlb_sync(struct iommu_domain *domain) +static void viommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { struct viommu_domain *vdomain = to_viommu_domain(domain); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ad41aee55bc6..64ebaff33455 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -258,10 +258,11 @@ struct iommu_ops { int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, - size_t size); + size_t size, struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); void (*iotlb_sync_map)(struct iommu_domain *domain); - void (*iotlb_sync)(struct iommu_domain *domain); + void (*iotlb_sync)(struct iommu_domain *domain, + struct iommu_iotlb_gather *iotlb_gather); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); int (*add_device)(struct device *dev); void (*remove_device)(struct device *dev); @@ -502,7 +503,7 @@ static inline void iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { if (domain->ops->iotlb_sync) - domain->ops->iotlb_sync(domain); + domain->ops->iotlb_sync(domain, iotlb_gather); iommu_iotlb_gather_init(iotlb_gather); } -- cgit v1.2.3 From 3445545b2248300319b6965208e77140c960c3fd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:44:16 +0100 Subject: iommu/io-pgtable: Introduce tlb_flush_walk() and tlb_flush_leaf() In preparation for deferring TLB flushes to iommu_tlb_sync(), introduce two new synchronous invalidation helpers to the io-pgtable API, which allow the unmap() code to force invalidation in cases where it cannot be deferred (e.g. when replacing a table with a block or when TLBI_ON_MAP is set). Signed-off-by: Will Deacon --- include/linux/io-pgtable.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 6292ea15d674..27275575b305 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -19,17 +19,31 @@ enum io_pgtable_fmt { /** * struct iommu_flush_ops - IOMMU callbacks for TLB and page table management. * - * @tlb_flush_all: Synchronously invalidate the entire TLB context. - * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range. - * @tlb_sync: Ensure any queued TLB invalidation has taken effect, and - * any corresponding page table updates are visible to the - * IOMMU. + * @tlb_flush_all: Synchronously invalidate the entire TLB context. + * @tlb_flush_walk: Synchronously invalidate all intermediate TLB state + * (sometimes referred to as the "walk cache") for a virtual + * address range. + * @tlb_flush_leaf: Synchronously invalidate all leaf TLB state for a virtual + * address range. + * @tlb_add_flush: Optional callback to queue up leaf TLB invalidation for a + * virtual address range. This function exists purely as an + * optimisation for IOMMUs that cannot batch TLB invalidation + * operations efficiently and are therefore better suited to + * issuing them early rather than deferring them until + * iommu_tlb_sync(). + * @tlb_sync: Ensure any queued TLB invalidation has taken effect, and + * any corresponding page table updates are visible to the + * IOMMU. * * Note that these can all be called in atomic context and must therefore * not block. */ struct iommu_flush_ops { void (*tlb_flush_all)(void *cookie); + void (*tlb_flush_walk)(unsigned long iova, size_t size, size_t granule, + void *cookie); + void (*tlb_flush_leaf)(unsigned long iova, size_t size, size_t granule, + void *cookie); void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule, bool leaf, void *cookie); void (*tlb_sync)(void *cookie); -- cgit v1.2.3 From 10b7a7d912697afd681a0bcfced9e05543aded35 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:44:32 +0100 Subject: iommu/io-pgtable-arm: Call ->tlb_flush_walk() and ->tlb_flush_leaf() Now that all IOMMU drivers using the io-pgtable API implement the ->tlb_flush_walk() and ->tlb_flush_leaf() callbacks, we can use them in the io-pgtable code instead of ->tlb_add_flush() immediately followed by ->tlb_sync(). Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm-v7s.c | 25 +++++++++++++++---------- drivers/iommu/io-pgtable-arm.c | 17 ++++++++++++----- include/linux/io-pgtable.h | 14 ++++++++++++++ 3 files changed, 41 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 116f97ee991e..8d4914fe73bc 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -493,9 +493,8 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova, * a chance for anything to kick off a table walk for the new iova. */ if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) { - io_pgtable_tlb_add_flush(iop, iova, size, - ARM_V7S_BLOCK_SIZE(2), false); - io_pgtable_tlb_sync(iop); + io_pgtable_tlb_flush_walk(iop, iova, size, + ARM_V7S_BLOCK_SIZE(2)); } else { wmb(); } @@ -541,8 +540,7 @@ static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg); size *= ARM_V7S_CONT_PAGES; - io_pgtable_tlb_add_flush(iop, iova, size, size, true); - io_pgtable_tlb_sync(iop); + io_pgtable_tlb_flush_leaf(iop, iova, size, size); return pte; } @@ -637,9 +635,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, for (i = 0; i < num_entries; i++) { if (ARM_V7S_PTE_IS_TABLE(pte[i], lvl)) { /* Also flush any partial walks */ - io_pgtable_tlb_add_flush(iop, iova, blk_size, - ARM_V7S_BLOCK_SIZE(lvl + 1), false); - io_pgtable_tlb_sync(iop); + io_pgtable_tlb_flush_walk(iop, iova, blk_size, + ARM_V7S_BLOCK_SIZE(lvl + 1)); ptep = iopte_deref(pte[i], lvl); __arm_v7s_free_table(ptep, lvl + 1, data); } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { @@ -805,13 +802,19 @@ static void dummy_tlb_flush_all(void *cookie) WARN_ON(cookie != cfg_cookie); } -static void dummy_tlb_add_flush(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) +static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule, + void *cookie) { WARN_ON(cookie != cfg_cookie); WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } +static void dummy_tlb_add_flush(unsigned long iova, size_t size, + size_t granule, bool leaf, void *cookie) +{ + dummy_tlb_flush(iova, size, granule, cookie); +} + static void dummy_tlb_sync(void *cookie) { WARN_ON(cookie != cfg_cookie); @@ -819,6 +822,8 @@ static void dummy_tlb_sync(void *cookie) static const struct iommu_flush_ops dummy_tlb_ops = { .tlb_flush_all = dummy_tlb_flush_all, + .tlb_flush_walk = dummy_tlb_flush, + .tlb_flush_leaf = dummy_tlb_flush, .tlb_add_flush = dummy_tlb_add_flush, .tlb_sync = dummy_tlb_sync, }; diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 402f913b6f6d..b58338c86323 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -611,9 +611,8 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_add_flush(iop, iova, size, - ARM_LPAE_GRANULE(data), false); - io_pgtable_tlb_sync(iop); + io_pgtable_tlb_flush_walk(iop, iova, size, + ARM_LPAE_GRANULE(data)); ptep = iopte_deref(pte, data); __arm_lpae_free_pgtable(data, lvl + 1, ptep); } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { @@ -1069,13 +1068,19 @@ static void dummy_tlb_flush_all(void *cookie) WARN_ON(cookie != cfg_cookie); } -static void dummy_tlb_add_flush(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) +static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule, + void *cookie) { WARN_ON(cookie != cfg_cookie); WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } +static void dummy_tlb_add_flush(unsigned long iova, size_t size, + size_t granule, bool leaf, void *cookie) +{ + dummy_tlb_flush(iova, size, granule, cookie); +} + static void dummy_tlb_sync(void *cookie) { WARN_ON(cookie != cfg_cookie); @@ -1083,6 +1088,8 @@ static void dummy_tlb_sync(void *cookie) static const struct iommu_flush_ops dummy_tlb_ops __initconst = { .tlb_flush_all = dummy_tlb_flush_all, + .tlb_flush_walk = dummy_tlb_flush, + .tlb_flush_leaf = dummy_tlb_flush, .tlb_add_flush = dummy_tlb_add_flush, .tlb_sync = dummy_tlb_sync, }; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 27275575b305..0618aac59e74 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -198,6 +198,20 @@ static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop) iop->cfg.tlb->tlb_flush_all(iop->cookie); } +static inline void +io_pgtable_tlb_flush_walk(struct io_pgtable *iop, unsigned long iova, + size_t size, size_t granule) +{ + iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie); +} + +static inline void +io_pgtable_tlb_flush_leaf(struct io_pgtable *iop, unsigned long iova, + size_t size, size_t granule) +{ + iop->cfg.tlb->tlb_flush_leaf(iova, size, granule, iop->cookie); +} + static inline void io_pgtable_tlb_add_flush(struct io_pgtable *iop, unsigned long iova, size_t size, size_t granule, bool leaf) { -- cgit v1.2.3 From abfd6fe0cd535d31ee83b668be6eb59ce6a8469d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:44:41 +0100 Subject: iommu/io-pgtable: Replace ->tlb_add_flush() with ->tlb_add_page() The ->tlb_add_flush() callback in the io-pgtable API now looks a bit silly: - It takes a size and a granule, which are always the same - It takes a 'bool leaf', which is always true - It only ever flushes a single page With that in mind, replace it with an optional ->tlb_add_page() callback that drops the useless parameters. Signed-off-by: Will Deacon --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 5 -- drivers/iommu/arm-smmu-v3.c | 8 ++- drivers/iommu/arm-smmu.c | 88 +++++++++++++++++++++------------ drivers/iommu/io-pgtable-arm-v7s.c | 12 ++--- drivers/iommu/io-pgtable-arm.c | 11 ++--- drivers/iommu/ipmmu-vmsa.c | 7 --- drivers/iommu/msm_iommu.c | 7 ++- drivers/iommu/mtk_iommu.c | 8 ++- drivers/iommu/qcom_iommu.c | 8 ++- include/linux/io-pgtable.h | 22 ++++----- 10 files changed, 105 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 651858147bd6..ff9af320cacc 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -247,10 +247,6 @@ static void mmu_tlb_inv_context_s1(void *cookie) mmu_hw_do_operation(pfdev, 0, 0, ~0UL, AS_COMMAND_FLUSH_MEM); } -static void mmu_tlb_inv_range_nosync(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) -{} - static void mmu_tlb_sync_context(void *cookie) { //struct panfrost_device *pfdev = cookie; @@ -273,7 +269,6 @@ static const struct iommu_flush_ops mmu_tlb_ops = { .tlb_flush_all = mmu_tlb_inv_context_s1, .tlb_flush_walk = mmu_tlb_flush_walk, .tlb_flush_leaf = mmu_tlb_flush_leaf, - .tlb_add_flush = mmu_tlb_inv_range_nosync, .tlb_sync = mmu_tlb_sync_context, }; diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 79819b003b07..98c90a1b4b22 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1603,6 +1603,12 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, } while (size -= granule); } +static void arm_smmu_tlb_inv_page_nosync(unsigned long iova, size_t granule, + void *cookie) +{ + arm_smmu_tlb_inv_range_nosync(iova, granule, granule, true, cookie); +} + static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, size_t granule, void *cookie) { @@ -1627,7 +1633,7 @@ static const struct iommu_flush_ops arm_smmu_flush_ops = { .tlb_flush_all = arm_smmu_tlb_inv_context, .tlb_flush_walk = arm_smmu_tlb_inv_walk, .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, - .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, + .tlb_add_page = arm_smmu_tlb_inv_page_nosync, .tlb_sync = arm_smmu_tlb_sync, }; diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index e9f01b860ae3..f056164a94b0 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -248,10 +248,16 @@ enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_BYPASS, }; +struct arm_smmu_flush_ops { + struct iommu_flush_ops tlb; + void (*tlb_inv_range)(unsigned long iova, size_t size, size_t granule, + bool leaf, void *cookie) +}; + struct arm_smmu_domain { struct arm_smmu_device *smmu; struct io_pgtable_ops *pgtbl_ops; - const struct iommu_flush_ops *tlb_ops; + const struct arm_smmu_flush_ops *flush_ops; struct arm_smmu_cfg cfg; enum arm_smmu_domain_stage stage; bool non_strict; @@ -551,42 +557,62 @@ static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, size_t granule, void *cookie) { struct arm_smmu_domain *smmu_domain = cookie; + const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops; - smmu_domain->tlb_ops->tlb_add_flush(iova, size, granule, false, cookie); - smmu_domain->tlb_ops->tlb_sync(cookie); + ops->tlb_inv_range(iova, size, granule, false, cookie); + ops->tlb.tlb_sync(cookie); } static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, size_t granule, void *cookie) { struct arm_smmu_domain *smmu_domain = cookie; + const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops; + + ops->tlb_inv_range(iova, size, granule, true, cookie); + ops->tlb.tlb_sync(cookie); +} + +static void arm_smmu_tlb_add_page(unsigned long iova, size_t granule, + void *cookie) +{ + struct arm_smmu_domain *smmu_domain = cookie; + const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops; - smmu_domain->tlb_ops->tlb_add_flush(iova, size, granule, true, cookie); - smmu_domain->tlb_ops->tlb_sync(cookie); + ops->tlb_inv_range(iova, granule, granule, true, cookie); } -static const struct iommu_flush_ops arm_smmu_s1_tlb_ops = { - .tlb_flush_all = arm_smmu_tlb_inv_context_s1, - .tlb_flush_walk = arm_smmu_tlb_inv_walk, - .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, - .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, - .tlb_sync = arm_smmu_tlb_sync_context, +static const struct arm_smmu_flush_ops arm_smmu_s1_tlb_ops = { + .tlb = { + .tlb_flush_all = arm_smmu_tlb_inv_context_s1, + .tlb_flush_walk = arm_smmu_tlb_inv_walk, + .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, + .tlb_add_page = arm_smmu_tlb_add_page, + .tlb_sync = arm_smmu_tlb_sync_context, + }, + .tlb_inv_range = arm_smmu_tlb_inv_range_nosync, }; -static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v2 = { - .tlb_flush_all = arm_smmu_tlb_inv_context_s2, - .tlb_flush_walk = arm_smmu_tlb_inv_walk, - .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, - .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, - .tlb_sync = arm_smmu_tlb_sync_context, +static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v2 = { + .tlb = { + .tlb_flush_all = arm_smmu_tlb_inv_context_s2, + .tlb_flush_walk = arm_smmu_tlb_inv_walk, + .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, + .tlb_add_page = arm_smmu_tlb_add_page, + .tlb_sync = arm_smmu_tlb_sync_context, + }, + .tlb_inv_range = arm_smmu_tlb_inv_range_nosync, }; -static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v1 = { - .tlb_flush_all = arm_smmu_tlb_inv_context_s2, - .tlb_flush_walk = arm_smmu_tlb_inv_walk, - .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, - .tlb_add_flush = arm_smmu_tlb_inv_vmid_nosync, - .tlb_sync = arm_smmu_tlb_sync_vmid, +static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v1 = { + .tlb = { + .tlb_flush_all = arm_smmu_tlb_inv_context_s2, + .tlb_flush_walk = arm_smmu_tlb_inv_walk, + .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, + .tlb_add_page = arm_smmu_tlb_add_page, + .tlb_sync = arm_smmu_tlb_sync_vmid, + }, + .tlb_inv_range = arm_smmu_tlb_inv_vmid_nosync, }; static irqreturn_t arm_smmu_context_fault(int irq, void *dev) @@ -866,7 +892,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, ias = min(ias, 32UL); oas = min(oas, 32UL); } - smmu_domain->tlb_ops = &arm_smmu_s1_tlb_ops; + smmu_domain->flush_ops = &arm_smmu_s1_tlb_ops; break; case ARM_SMMU_DOMAIN_NESTED: /* @@ -886,9 +912,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, oas = min(oas, 40UL); } if (smmu->version == ARM_SMMU_V2) - smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v2; + smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v2; else - smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v1; + smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v1; break; default: ret = -EINVAL; @@ -917,7 +943,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, .ias = ias, .oas = oas, .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK, - .tlb = smmu_domain->tlb_ops, + .tlb = &smmu_domain->flush_ops->tlb, .iommu_dev = smmu->dev, }; @@ -1346,9 +1372,9 @@ static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - if (smmu_domain->tlb_ops) { + if (smmu_domain->flush_ops) { arm_smmu_rpm_get(smmu); - smmu_domain->tlb_ops->tlb_flush_all(smmu_domain); + smmu_domain->flush_ops->tlb.tlb_flush_all(smmu_domain); arm_smmu_rpm_put(smmu); } } @@ -1359,9 +1385,9 @@ static void arm_smmu_iotlb_sync(struct iommu_domain *domain, struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - if (smmu_domain->tlb_ops) { + if (smmu_domain->flush_ops) { arm_smmu_rpm_get(smmu); - smmu_domain->tlb_ops->tlb_sync(smmu_domain); + smmu_domain->flush_ops->tlb.tlb_sync(smmu_domain); arm_smmu_rpm_put(smmu); } } diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 8d4914fe73bc..b3f975c95f76 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -584,7 +584,7 @@ static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, return __arm_v7s_unmap(data, iova, size, 2, tablep); } - io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); + io_pgtable_tlb_add_page(&data->iop, iova, size); return size; } @@ -647,8 +647,7 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, */ smp_wmb(); } else { - io_pgtable_tlb_add_flush(iop, iova, blk_size, - blk_size, true); + io_pgtable_tlb_add_page(iop, iova, blk_size); } iova += blk_size; } @@ -809,10 +808,9 @@ static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule, WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } -static void dummy_tlb_add_flush(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) +static void dummy_tlb_add_page(unsigned long iova, size_t granule, void *cookie) { - dummy_tlb_flush(iova, size, granule, cookie); + dummy_tlb_flush(iova, granule, granule, cookie); } static void dummy_tlb_sync(void *cookie) @@ -824,7 +822,7 @@ static const struct iommu_flush_ops dummy_tlb_ops = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_flush_walk = dummy_tlb_flush, .tlb_flush_leaf = dummy_tlb_flush, - .tlb_add_flush = dummy_tlb_add_flush, + .tlb_add_page = dummy_tlb_add_page, .tlb_sync = dummy_tlb_sync, }; diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index b58338c86323..a5c0db01533e 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -582,7 +582,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, tablep = iopte_deref(pte, data); } else if (unmap_idx >= 0) { - io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); + io_pgtable_tlb_add_page(&data->iop, iova, size); return size; } @@ -623,7 +623,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, */ smp_wmb(); } else { - io_pgtable_tlb_add_flush(iop, iova, size, size, true); + io_pgtable_tlb_add_page(iop, iova, size); } return size; @@ -1075,10 +1075,9 @@ static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule, WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } -static void dummy_tlb_add_flush(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) +static void dummy_tlb_add_page(unsigned long iova, size_t granule, void *cookie) { - dummy_tlb_flush(iova, size, granule, cookie); + dummy_tlb_flush(iova, granule, granule, cookie); } static void dummy_tlb_sync(void *cookie) @@ -1090,7 +1089,7 @@ static const struct iommu_flush_ops dummy_tlb_ops __initconst = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_flush_walk = dummy_tlb_flush, .tlb_flush_leaf = dummy_tlb_flush, - .tlb_add_flush = dummy_tlb_add_flush, + .tlb_add_page = dummy_tlb_add_page, .tlb_sync = dummy_tlb_sync, }; diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 9cc7bcb7e39d..c4da271af90e 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -367,17 +367,10 @@ static void ipmmu_tlb_flush(unsigned long iova, size_t size, ipmmu_tlb_flush_all(cookie); } -static void ipmmu_tlb_add_flush(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) -{ - /* The hardware doesn't support selective TLB flush. */ -} - static const struct iommu_flush_ops ipmmu_flush_ops = { .tlb_flush_all = ipmmu_tlb_flush_all, .tlb_flush_walk = ipmmu_tlb_flush, .tlb_flush_leaf = ipmmu_tlb_flush, - .tlb_add_flush = ipmmu_tlb_add_flush, .tlb_sync = ipmmu_tlb_flush_all, }; diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 64132093751a..2cd83295a841 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -192,11 +192,16 @@ static void __flush_iotlb_leaf(unsigned long iova, size_t size, __flush_iotlb_sync(cookie); } +static void __flush_iotlb_page(unsigned long iova, size_t granule, void *cookie) +{ + __flush_iotlb_range(iova, granule, granule, true, cookie); +} + static const struct iommu_flush_ops msm_iommu_flush_ops = { .tlb_flush_all = __flush_iotlb, .tlb_flush_walk = __flush_iotlb_walk, .tlb_flush_leaf = __flush_iotlb_leaf, - .tlb_add_flush = __flush_iotlb_range, + .tlb_add_page = __flush_iotlb_page, .tlb_sync = __flush_iotlb_sync, }; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 85a7176bf9ae..a0b4b4dc4b90 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -202,11 +202,17 @@ static void mtk_iommu_tlb_flush_leaf(unsigned long iova, size_t size, mtk_iommu_tlb_sync(cookie); } +static void mtk_iommu_tlb_flush_page_nosync(unsigned long iova, size_t granule, + void *cookie) +{ + mtk_iommu_tlb_add_flush_nosync(iova, granule, granule, true, cookie); +} + static const struct iommu_flush_ops mtk_iommu_flush_ops = { .tlb_flush_all = mtk_iommu_tlb_flush_all, .tlb_flush_walk = mtk_iommu_tlb_flush_walk, .tlb_flush_leaf = mtk_iommu_tlb_flush_leaf, - .tlb_add_flush = mtk_iommu_tlb_add_flush_nosync, + .tlb_add_page = mtk_iommu_tlb_flush_page_nosync, .tlb_sync = mtk_iommu_tlb_sync, }; diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 643079e52e69..7d8411dee4cf 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -178,11 +178,17 @@ static void qcom_iommu_tlb_flush_leaf(unsigned long iova, size_t size, qcom_iommu_tlb_sync(cookie); } +static void qcom_iommu_tlb_add_page(unsigned long iova, size_t granule, + void *cookie) +{ + qcom_iommu_tlb_inv_range_nosync(iova, granule, granule, true, cookie); +} + static const struct iommu_flush_ops qcom_flush_ops = { .tlb_flush_all = qcom_iommu_tlb_inv_context, .tlb_flush_walk = qcom_iommu_tlb_flush_walk, .tlb_flush_leaf = qcom_iommu_tlb_flush_leaf, - .tlb_add_flush = qcom_iommu_tlb_inv_range_nosync, + .tlb_add_page = qcom_iommu_tlb_add_page, .tlb_sync = qcom_iommu_tlb_sync, }; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 0618aac59e74..99e04bd2baa1 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -25,12 +25,11 @@ enum io_pgtable_fmt { * address range. * @tlb_flush_leaf: Synchronously invalidate all leaf TLB state for a virtual * address range. - * @tlb_add_flush: Optional callback to queue up leaf TLB invalidation for a - * virtual address range. This function exists purely as an - * optimisation for IOMMUs that cannot batch TLB invalidation - * operations efficiently and are therefore better suited to - * issuing them early rather than deferring them until - * iommu_tlb_sync(). + * @tlb_add_page: Optional callback to queue up leaf TLB invalidation for a + * single page. This function exists purely as an optimisation + * for IOMMUs that cannot batch TLB invalidation operations + * efficiently and are therefore better suited to issuing them + * early rather than deferring them until iommu_tlb_sync(). * @tlb_sync: Ensure any queued TLB invalidation has taken effect, and * any corresponding page table updates are visible to the * IOMMU. @@ -44,8 +43,7 @@ struct iommu_flush_ops { void *cookie); void (*tlb_flush_leaf)(unsigned long iova, size_t size, size_t granule, void *cookie); - void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule, - bool leaf, void *cookie); + void (*tlb_add_page)(unsigned long iova, size_t granule, void *cookie); void (*tlb_sync)(void *cookie); }; @@ -212,10 +210,12 @@ io_pgtable_tlb_flush_leaf(struct io_pgtable *iop, unsigned long iova, iop->cfg.tlb->tlb_flush_leaf(iova, size, granule, iop->cookie); } -static inline void io_pgtable_tlb_add_flush(struct io_pgtable *iop, - unsigned long iova, size_t size, size_t granule, bool leaf) +static inline void +io_pgtable_tlb_add_page(struct io_pgtable *iop, unsigned long iova, + size_t granule) { - iop->cfg.tlb->tlb_add_flush(iova, size, granule, leaf, iop->cookie); + if (iop->cfg.tlb->tlb_add_page) + iop->cfg.tlb->tlb_add_page(iova, granule, iop->cookie); } static inline void io_pgtable_tlb_sync(struct io_pgtable *iop) -- cgit v1.2.3 From e953f7f2fa78d1c7fd064171f88457c6b1e21af9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:44:50 +0100 Subject: iommu/io-pgtable: Remove unused ->tlb_sync() callback The ->tlb_sync() callback is no longer used, so it can be removed. Signed-off-by: Will Deacon --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 1 - drivers/iommu/arm-smmu-v3.c | 8 -------- drivers/iommu/arm-smmu.c | 17 +++++++++-------- drivers/iommu/io-pgtable-arm-v7s.c | 6 ------ drivers/iommu/io-pgtable-arm.c | 6 ------ drivers/iommu/ipmmu-vmsa.c | 1 - drivers/iommu/msm_iommu.c | 20 +++++++------------- drivers/iommu/mtk_iommu.c | 1 - drivers/iommu/qcom_iommu.c | 1 - include/linux/io-pgtable.h | 9 --------- 10 files changed, 16 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index ff9af320cacc..de22a2276e00 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -269,7 +269,6 @@ static const struct iommu_flush_ops mmu_tlb_ops = { .tlb_flush_all = mmu_tlb_inv_context_s1, .tlb_flush_walk = mmu_tlb_flush_walk, .tlb_flush_leaf = mmu_tlb_flush_leaf, - .tlb_sync = mmu_tlb_sync_context, }; static const char *access_type_name(struct panfrost_device *pfdev, diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 98c90a1b4b22..231093413ff9 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1545,13 +1545,6 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, } /* IO_PGTABLE API */ -static void arm_smmu_tlb_sync(void *cookie) -{ - struct arm_smmu_domain *smmu_domain = cookie; - - arm_smmu_cmdq_issue_sync(smmu_domain->smmu); -} - static void arm_smmu_tlb_inv_context(void *cookie) { struct arm_smmu_domain *smmu_domain = cookie; @@ -1634,7 +1627,6 @@ static const struct iommu_flush_ops arm_smmu_flush_ops = { .tlb_flush_walk = arm_smmu_tlb_inv_walk, .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, .tlb_add_page = arm_smmu_tlb_inv_page_nosync, - .tlb_sync = arm_smmu_tlb_sync, }; /* IOMMU API */ diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index f056164a94b0..07a267c437d6 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -251,7 +251,8 @@ enum arm_smmu_domain_stage { struct arm_smmu_flush_ops { struct iommu_flush_ops tlb; void (*tlb_inv_range)(unsigned long iova, size_t size, size_t granule, - bool leaf, void *cookie) + bool leaf, void *cookie); + void (*tlb_sync)(void *cookie); }; struct arm_smmu_domain { @@ -539,7 +540,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, * On MMU-401 at least, the cost of firing off multiple TLBIVMIDs appears * almost negligible, but the benefit of getting the first one in as far ahead * of the sync as possible is significant, hence we don't just make this a - * no-op and set .tlb_sync to arm_smmu_inv_context_s2() as you might think. + * no-op and set .tlb_sync to arm_smmu_tlb_inv_context_s2() as you might think. */ static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size, size_t granule, bool leaf, void *cookie) @@ -560,7 +561,7 @@ static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops; ops->tlb_inv_range(iova, size, granule, false, cookie); - ops->tlb.tlb_sync(cookie); + ops->tlb_sync(cookie); } static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, @@ -570,7 +571,7 @@ static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops; ops->tlb_inv_range(iova, size, granule, true, cookie); - ops->tlb.tlb_sync(cookie); + ops->tlb_sync(cookie); } static void arm_smmu_tlb_add_page(unsigned long iova, size_t granule, @@ -588,9 +589,9 @@ static const struct arm_smmu_flush_ops arm_smmu_s1_tlb_ops = { .tlb_flush_walk = arm_smmu_tlb_inv_walk, .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, .tlb_add_page = arm_smmu_tlb_add_page, - .tlb_sync = arm_smmu_tlb_sync_context, }, .tlb_inv_range = arm_smmu_tlb_inv_range_nosync, + .tlb_sync = arm_smmu_tlb_sync_context, }; static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v2 = { @@ -599,9 +600,9 @@ static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v2 = { .tlb_flush_walk = arm_smmu_tlb_inv_walk, .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, .tlb_add_page = arm_smmu_tlb_add_page, - .tlb_sync = arm_smmu_tlb_sync_context, }, .tlb_inv_range = arm_smmu_tlb_inv_range_nosync, + .tlb_sync = arm_smmu_tlb_sync_context, }; static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v1 = { @@ -610,9 +611,9 @@ static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v1 = { .tlb_flush_walk = arm_smmu_tlb_inv_walk, .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, .tlb_add_page = arm_smmu_tlb_add_page, - .tlb_sync = arm_smmu_tlb_sync_vmid, }, .tlb_inv_range = arm_smmu_tlb_inv_vmid_nosync, + .tlb_sync = arm_smmu_tlb_sync_vmid, }; static irqreturn_t arm_smmu_context_fault(int irq, void *dev) @@ -1387,7 +1388,7 @@ static void arm_smmu_iotlb_sync(struct iommu_domain *domain, if (smmu_domain->flush_ops) { arm_smmu_rpm_get(smmu); - smmu_domain->flush_ops->tlb.tlb_sync(smmu_domain); + smmu_domain->flush_ops->tlb_sync(smmu_domain); arm_smmu_rpm_put(smmu); } } diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index b3f975c95f76..203894fb6765 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -813,17 +813,11 @@ static void dummy_tlb_add_page(unsigned long iova, size_t granule, void *cookie) dummy_tlb_flush(iova, granule, granule, cookie); } -static void dummy_tlb_sync(void *cookie) -{ - WARN_ON(cookie != cfg_cookie); -} - static const struct iommu_flush_ops dummy_tlb_ops = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_flush_walk = dummy_tlb_flush, .tlb_flush_leaf = dummy_tlb_flush, .tlb_add_page = dummy_tlb_add_page, - .tlb_sync = dummy_tlb_sync, }; #define __FAIL(ops) ({ \ diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index a5c0db01533e..f35516744965 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -1080,17 +1080,11 @@ static void dummy_tlb_add_page(unsigned long iova, size_t granule, void *cookie) dummy_tlb_flush(iova, granule, granule, cookie); } -static void dummy_tlb_sync(void *cookie) -{ - WARN_ON(cookie != cfg_cookie); -} - static const struct iommu_flush_ops dummy_tlb_ops __initconst = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_flush_walk = dummy_tlb_flush, .tlb_flush_leaf = dummy_tlb_flush, .tlb_add_page = dummy_tlb_add_page, - .tlb_sync = dummy_tlb_sync, }; static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index c4da271af90e..a2b8eff4c1f7 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -371,7 +371,6 @@ static const struct iommu_flush_ops ipmmu_flush_ops = { .tlb_flush_all = ipmmu_tlb_flush_all, .tlb_flush_walk = ipmmu_tlb_flush, .tlb_flush_leaf = ipmmu_tlb_flush, - .tlb_sync = ipmmu_tlb_flush_all, }; /* ----------------------------------------------------------------------------- diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 2cd83295a841..ccfc7ed230ef 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -168,28 +168,16 @@ fail: return; } -static void __flush_iotlb_sync(void *cookie) -{ - /* - * Nothing is needed here, the barrier to guarantee - * completion of the tlb sync operation is implicitly - * taken care when the iommu client does a writel before - * kick starting the other master. - */ -} - static void __flush_iotlb_walk(unsigned long iova, size_t size, size_t granule, void *cookie) { __flush_iotlb_range(iova, size, granule, false, cookie); - __flush_iotlb_sync(cookie); } static void __flush_iotlb_leaf(unsigned long iova, size_t size, size_t granule, void *cookie) { __flush_iotlb_range(iova, size, granule, true, cookie); - __flush_iotlb_sync(cookie); } static void __flush_iotlb_page(unsigned long iova, size_t granule, void *cookie) @@ -202,7 +190,6 @@ static const struct iommu_flush_ops msm_iommu_flush_ops = { .tlb_flush_walk = __flush_iotlb_walk, .tlb_flush_leaf = __flush_iotlb_leaf, .tlb_add_page = __flush_iotlb_page, - .tlb_sync = __flush_iotlb_sync, }; static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end) @@ -712,6 +699,13 @@ static struct iommu_ops msm_iommu_ops = { .detach_dev = msm_iommu_detach_dev, .map = msm_iommu_map, .unmap = msm_iommu_unmap, + /* + * Nothing is needed here, the barrier to guarantee + * completion of the tlb sync operation is implicitly + * taken care when the iommu client does a writel before + * kick starting the other master. + */ + .iotlb_sync = NULL, .iova_to_phys = msm_iommu_iova_to_phys, .add_device = msm_iommu_add_device, .remove_device = msm_iommu_remove_device, diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index a0b4b4dc4b90..3785750bdb44 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -213,7 +213,6 @@ static const struct iommu_flush_ops mtk_iommu_flush_ops = { .tlb_flush_walk = mtk_iommu_tlb_flush_walk, .tlb_flush_leaf = mtk_iommu_tlb_flush_leaf, .tlb_add_page = mtk_iommu_tlb_flush_page_nosync, - .tlb_sync = mtk_iommu_tlb_sync, }; static irqreturn_t mtk_iommu_isr(int irq, void *dev_id) diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 7d8411dee4cf..0b8a6d6bb475 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -189,7 +189,6 @@ static const struct iommu_flush_ops qcom_flush_ops = { .tlb_flush_walk = qcom_iommu_tlb_flush_walk, .tlb_flush_leaf = qcom_iommu_tlb_flush_leaf, .tlb_add_page = qcom_iommu_tlb_add_page, - .tlb_sync = qcom_iommu_tlb_sync, }; static irqreturn_t qcom_iommu_fault(int irq, void *dev) diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 99e04bd2baa1..843310484fe2 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -30,9 +30,6 @@ enum io_pgtable_fmt { * for IOMMUs that cannot batch TLB invalidation operations * efficiently and are therefore better suited to issuing them * early rather than deferring them until iommu_tlb_sync(). - * @tlb_sync: Ensure any queued TLB invalidation has taken effect, and - * any corresponding page table updates are visible to the - * IOMMU. * * Note that these can all be called in atomic context and must therefore * not block. @@ -44,7 +41,6 @@ struct iommu_flush_ops { void (*tlb_flush_leaf)(unsigned long iova, size_t size, size_t granule, void *cookie); void (*tlb_add_page)(unsigned long iova, size_t granule, void *cookie); - void (*tlb_sync)(void *cookie); }; /** @@ -218,11 +214,6 @@ io_pgtable_tlb_add_page(struct io_pgtable *iop, unsigned long iova, iop->cfg.tlb->tlb_add_page(iova, granule, iop->cookie); } -static inline void io_pgtable_tlb_sync(struct io_pgtable *iop) -{ - iop->cfg.tlb->tlb_sync(iop->cookie); -} - /** * struct io_pgtable_init_fns - Alloc/free a set of page tables for a * particular format. -- cgit v1.2.3 From a2d3a382d6c682e22b263c9e7f0d857c3fa6c9d6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:44:58 +0100 Subject: iommu/io-pgtable: Pass struct iommu_iotlb_gather to ->unmap() Update the io-pgtable ->unmap() function to take an iommu_iotlb_gather pointer as an argument, and update the callers as appropriate. Signed-off-by: Will Deacon --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 2 +- drivers/iommu/arm-smmu-v3.c | 2 +- drivers/iommu/arm-smmu.c | 2 +- drivers/iommu/io-pgtable-arm-v7s.c | 6 +++--- drivers/iommu/io-pgtable-arm.c | 7 +++---- drivers/iommu/ipmmu-vmsa.c | 2 +- drivers/iommu/msm_iommu.c | 2 +- drivers/iommu/mtk_iommu.c | 2 +- drivers/iommu/qcom_iommu.c | 2 +- include/linux/io-pgtable.h | 4 +++- 10 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index de22a2276e00..6e8145c36e93 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -222,7 +222,7 @@ void panfrost_mmu_unmap(struct panfrost_gem_object *bo) size_t unmapped_page; size_t pgsize = get_pgsize(iova, len - unmapped_len); - unmapped_page = ops->unmap(ops, iova, pgsize); + unmapped_page = ops->unmap(ops, iova, pgsize, NULL); if (!unmapped_page) break; diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 231093413ff9..8e2e53079f48 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2015,7 +2015,7 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, if (!ops) return 0; - ret = ops->unmap(ops, iova, size); + ret = ops->unmap(ops, iova, size, gather); if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size)) return 0; diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 07a267c437d6..f6689956ab6e 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1362,7 +1362,7 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, return 0; arm_smmu_rpm_get(smmu); - ret = ops->unmap(ops, iova, size); + ret = ops->unmap(ops, iova, size, gather); arm_smmu_rpm_put(smmu); return ret; diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 203894fb6765..a7776e982b6c 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -666,7 +666,7 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, } static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops); @@ -892,7 +892,7 @@ static int __init arm_v7s_do_selftests(void) size = 1UL << __ffs(cfg.pgsize_bitmap); while (i < loopnr) { iova_start = i * SZ_16M; - if (ops->unmap(ops, iova_start + size, size) != size) + if (ops->unmap(ops, iova_start + size, size, NULL) != size) return __FAIL(ops); /* Remap of partial unmap */ @@ -910,7 +910,7 @@ static int __init arm_v7s_do_selftests(void) for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) { size = 1UL << i; - if (ops->unmap(ops, iova, size) != size) + if (ops->unmap(ops, iova, size, NULL) != size) return __FAIL(ops); if (ops->iova_to_phys(ops, iova + 42)) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index f35516744965..325430f8a0a1 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -642,7 +641,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, } static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); arm_lpae_iopte *ptep = data->pgd; @@ -1167,7 +1166,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) /* Partial unmap */ size = 1UL << __ffs(cfg->pgsize_bitmap); - if (ops->unmap(ops, SZ_1G + size, size) != size) + if (ops->unmap(ops, SZ_1G + size, size, NULL) != size) return __FAIL(ops, i); /* Remap of partial unmap */ @@ -1182,7 +1181,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { size = 1UL << j; - if (ops->unmap(ops, iova, size) != size) + if (ops->unmap(ops, iova, size, NULL) != size) return __FAIL(ops, i); if (ops->iova_to_phys(ops, iova + 42)) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index a2b8eff4c1f7..76a8ec343d53 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -737,7 +737,7 @@ static size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova, { struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); - return domain->iop->unmap(domain->iop, iova, size); + return domain->iop->unmap(domain->iop, iova, size, gather); } static void ipmmu_flush_iotlb_all(struct iommu_domain *io_domain) diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index ccfc7ed230ef..8a0dcaf0a9e9 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -523,7 +523,7 @@ static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova, unsigned long flags; spin_lock_irqsave(&priv->pgtlock, flags); - len = priv->iop->unmap(priv->iop, iova, len); + len = priv->iop->unmap(priv->iop, iova, len, gather); spin_unlock_irqrestore(&priv->pgtlock, flags); return len; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 3785750bdb44..b73cffd63262 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -400,7 +400,7 @@ static size_t mtk_iommu_unmap(struct iommu_domain *domain, size_t unmapsz; spin_lock_irqsave(&dom->pgtlock, flags); - unmapsz = dom->iop->unmap(dom->iop, iova, size); + unmapsz = dom->iop->unmap(dom->iop, iova, size, gather); spin_unlock_irqrestore(&dom->pgtlock, flags); return unmapsz; diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 0b8a6d6bb475..48b288ef74b4 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -455,7 +455,7 @@ static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova, */ pm_runtime_get_sync(qcom_domain->iommu->dev); spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags); - ret = ops->unmap(ops, iova, size); + ret = ops->unmap(ops, iova, size, gather); spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags); pm_runtime_put_sync(qcom_domain->iommu->dev); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 843310484fe2..fe27d93c8ad9 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -1,7 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IO_PGTABLE_H #define __IO_PGTABLE_H + #include +#include /* * Public API for use by IOMMU drivers @@ -136,7 +138,7 @@ struct io_pgtable_ops { int (*map)(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t size, int prot); size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, - size_t size); + size_t size, struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); }; -- cgit v1.2.3 From 3951c41af4a65ba418e6b1b973d398552bedb84f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 16:45:15 +0100 Subject: iommu/io-pgtable: Pass struct iommu_iotlb_gather to ->tlb_add_page() With all the pieces in place, we can finally propagate the iommu_iotlb_gather structure from the call to unmap() down to the IOMMU drivers' implementation of ->tlb_add_page(). Currently everybody ignores it, but the machinery is now there to defer invalidation. Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 3 ++- drivers/iommu/arm-smmu.c | 3 ++- drivers/iommu/io-pgtable-arm-v7s.c | 23 ++++++++++++++--------- drivers/iommu/io-pgtable-arm.c | 22 ++++++++++++++-------- drivers/iommu/msm_iommu.c | 3 ++- drivers/iommu/mtk_iommu.c | 3 ++- drivers/iommu/qcom_iommu.c | 3 ++- include/linux/io-pgtable.h | 16 +++++++++------- 8 files changed, 47 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 8e2e53079f48..d1ebc7103065 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1596,7 +1596,8 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, } while (size -= granule); } -static void arm_smmu_tlb_inv_page_nosync(unsigned long iova, size_t granule, +static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { arm_smmu_tlb_inv_range_nosync(iova, granule, granule, true, cookie); diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index f6689956ab6e..5598d0ff71a8 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -574,7 +574,8 @@ static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, ops->tlb_sync(cookie); } -static void arm_smmu_tlb_add_page(unsigned long iova, size_t granule, +static void arm_smmu_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { struct arm_smmu_domain *smmu_domain = cookie; diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index a7776e982b6c..18e7d212c7de 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -362,7 +362,8 @@ static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl) return false; } -static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *, unsigned long, +static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *, + struct iommu_iotlb_gather *, unsigned long, size_t, int, arm_v7s_iopte *); static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, @@ -383,7 +384,7 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, size_t sz = ARM_V7S_BLOCK_SIZE(lvl); tblp = ptep - ARM_V7S_LVL_IDX(iova, lvl); - if (WARN_ON(__arm_v7s_unmap(data, iova + i * sz, + if (WARN_ON(__arm_v7s_unmap(data, NULL, iova + i * sz, sz, lvl, tblp) != sz)) return -EINVAL; } else if (ptep[i]) { @@ -545,6 +546,7 @@ static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, } static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, arm_v7s_iopte blk_pte, arm_v7s_iopte *ptep) @@ -581,14 +583,15 @@ static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, return 0; tablep = iopte_deref(pte, 1); - return __arm_v7s_unmap(data, iova, size, 2, tablep); + return __arm_v7s_unmap(data, gather, iova, size, 2, tablep); } - io_pgtable_tlb_add_page(&data->iop, iova, size); + io_pgtable_tlb_add_page(&data->iop, gather, iova, size); return size; } static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, arm_v7s_iopte *ptep) { @@ -647,7 +650,7 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, */ smp_wmb(); } else { - io_pgtable_tlb_add_page(iop, iova, blk_size); + io_pgtable_tlb_add_page(iop, gather, iova, blk_size); } iova += blk_size; } @@ -657,12 +660,13 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, * Insert a table at the next level to map the old region, * minus the part we want to unmap */ - return arm_v7s_split_blk_unmap(data, iova, size, pte[0], ptep); + return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0], + ptep); } /* Keep on walkin' */ ptep = iopte_deref(pte[0], lvl); - return __arm_v7s_unmap(data, iova, size, lvl + 1, ptep); + return __arm_v7s_unmap(data, gather, iova, size, lvl + 1, ptep); } static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, @@ -673,7 +677,7 @@ static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(upper_32_bits(iova))) return 0; - return __arm_v7s_unmap(data, iova, size, 1, data->pgd); + return __arm_v7s_unmap(data, gather, iova, size, 1, data->pgd); } static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops, @@ -808,7 +812,8 @@ static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule, WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } -static void dummy_tlb_add_page(unsigned long iova, size_t granule, void *cookie) +static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { dummy_tlb_flush(iova, granule, granule, cookie); } diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 325430f8a0a1..4c91359057c5 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -289,6 +289,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte, } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, arm_lpae_iopte *ptep); @@ -334,8 +335,10 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data); tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data); - if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz)) + if (__arm_lpae_unmap(data, NULL, iova, sz, lvl, tblp) != sz) { + WARN_ON(1); return -EINVAL; + } } __arm_lpae_init_pte(data, paddr, prot, lvl, ptep); @@ -536,6 +539,7 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop) } static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, arm_lpae_iopte blk_pte, int lvl, arm_lpae_iopte *ptep) @@ -581,14 +585,15 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, tablep = iopte_deref(pte, data); } else if (unmap_idx >= 0) { - io_pgtable_tlb_add_page(&data->iop, iova, size); + io_pgtable_tlb_add_page(&data->iop, gather, iova, size); return size; } - return __arm_lpae_unmap(data, iova, size, lvl, tablep); + return __arm_lpae_unmap(data, gather, iova, size, lvl, tablep); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, arm_lpae_iopte *ptep) { @@ -622,7 +627,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, */ smp_wmb(); } else { - io_pgtable_tlb_add_page(iop, iova, size); + io_pgtable_tlb_add_page(iop, gather, iova, size); } return size; @@ -631,13 +636,13 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, * Insert a table at the next level to map the old region, * minus the part we want to unmap */ - return arm_lpae_split_blk_unmap(data, iova, size, pte, + return arm_lpae_split_blk_unmap(data, gather, iova, size, pte, lvl + 1, ptep); } /* Keep on walkin' */ ptep = iopte_deref(pte, data); - return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep); + return __arm_lpae_unmap(data, gather, iova, size, lvl + 1, ptep); } static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, @@ -650,7 +655,7 @@ static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias))) return 0; - return __arm_lpae_unmap(data, iova, size, lvl, ptep); + return __arm_lpae_unmap(data, gather, iova, size, lvl, ptep); } static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, @@ -1074,7 +1079,8 @@ static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule, WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } -static void dummy_tlb_add_page(unsigned long iova, size_t granule, void *cookie) +static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { dummy_tlb_flush(iova, granule, granule, cookie); } diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 8a0dcaf0a9e9..4c0be5b75c28 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -180,7 +180,8 @@ static void __flush_iotlb_leaf(unsigned long iova, size_t size, __flush_iotlb_range(iova, size, granule, true, cookie); } -static void __flush_iotlb_page(unsigned long iova, size_t granule, void *cookie) +static void __flush_iotlb_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { __flush_iotlb_range(iova, granule, granule, true, cookie); } diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index b73cffd63262..0827d51936fa 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -202,7 +202,8 @@ static void mtk_iommu_tlb_flush_leaf(unsigned long iova, size_t size, mtk_iommu_tlb_sync(cookie); } -static void mtk_iommu_tlb_flush_page_nosync(unsigned long iova, size_t granule, +static void mtk_iommu_tlb_flush_page_nosync(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { mtk_iommu_tlb_add_flush_nosync(iova, granule, granule, true, cookie); diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 48b288ef74b4..eac760cdbb28 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -178,7 +178,8 @@ static void qcom_iommu_tlb_flush_leaf(unsigned long iova, size_t size, qcom_iommu_tlb_sync(cookie); } -static void qcom_iommu_tlb_add_page(unsigned long iova, size_t granule, +static void qcom_iommu_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { qcom_iommu_tlb_inv_range_nosync(iova, granule, granule, true, cookie); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index fe27d93c8ad9..6b1b8be3ebec 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -28,10 +28,10 @@ enum io_pgtable_fmt { * @tlb_flush_leaf: Synchronously invalidate all leaf TLB state for a virtual * address range. * @tlb_add_page: Optional callback to queue up leaf TLB invalidation for a - * single page. This function exists purely as an optimisation - * for IOMMUs that cannot batch TLB invalidation operations - * efficiently and are therefore better suited to issuing them - * early rather than deferring them until iommu_tlb_sync(). + * single page. IOMMUs that cannot batch TLB invalidation + * operations efficiently will typically issue them here, but + * others may decide to update the iommu_iotlb_gather structure + * and defer the invalidation until iommu_tlb_sync() instead. * * Note that these can all be called in atomic context and must therefore * not block. @@ -42,7 +42,8 @@ struct iommu_flush_ops { void *cookie); void (*tlb_flush_leaf)(unsigned long iova, size_t size, size_t granule, void *cookie); - void (*tlb_add_page)(unsigned long iova, size_t granule, void *cookie); + void (*tlb_add_page)(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie); }; /** @@ -209,11 +210,12 @@ io_pgtable_tlb_flush_leaf(struct io_pgtable *iop, unsigned long iova, } static inline void -io_pgtable_tlb_add_page(struct io_pgtable *iop, unsigned long iova, +io_pgtable_tlb_add_page(struct io_pgtable *iop, + struct iommu_iotlb_gather * gather, unsigned long iova, size_t granule) { if (iop->cfg.tlb->tlb_add_page) - iop->cfg.tlb->tlb_add_page(iova, granule, iop->cookie); + iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie); } /** -- cgit v1.2.3 From 156189a6d7a797d6fa0b13dd6b4f32b11d234a99 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 29 Jul 2019 01:40:15 +0900 Subject: leds: netxbig: remove legacy board-file support Since commit ebc278f15759 ("ARM: mvebu: remove static LED setup for netxbig boards"), no one in upstream passes in the platform data to this driver. Squash leds-kirkwood-netxbig.h into the driver, and remove the legacy board-file support. Link: https://lkml.org/lkml/2019/7/20/83 Suggested-by: Arnd Bergmann Signed-off-by: Masahiro Yamada Signed-off-by: Jacek Anaszewski --- drivers/leds/Kconfig | 1 + drivers/leds/leds-netxbig.c | 70 +++++++++++++++------- .../linux/platform_data/leds-kirkwood-netxbig.h | 54 ----------------- 3 files changed, 51 insertions(+), 74 deletions(-) delete mode 100644 include/linux/platform_data/leds-kirkwood-netxbig.h (limited to 'include') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index f7a3dd7ecf3d..1988de1d64c0 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -590,6 +590,7 @@ config LEDS_NETXBIG tristate "LED support for Big Network series LEDs" depends on LEDS_CLASS depends on MACH_KIRKWOOD + depends on OF_GPIO default y help This option enables support for LEDs found on the LaCie 2Big diff --git a/drivers/leds/leds-netxbig.c b/drivers/leds/leds-netxbig.c index 10497a466775..0944cb111c34 100644 --- a/drivers/leds/leds-netxbig.c +++ b/drivers/leds/leds-netxbig.c @@ -15,7 +15,48 @@ #include #include #include -#include + +struct netxbig_gpio_ext { + unsigned int *addr; + int num_addr; + unsigned int *data; + int num_data; + unsigned int enable; +}; + +enum netxbig_led_mode { + NETXBIG_LED_OFF, + NETXBIG_LED_ON, + NETXBIG_LED_SATA, + NETXBIG_LED_TIMER1, + NETXBIG_LED_TIMER2, + NETXBIG_LED_MODE_NUM, +}; + +#define NETXBIG_LED_INVALID_MODE NETXBIG_LED_MODE_NUM + +struct netxbig_led_timer { + unsigned long delay_on; + unsigned long delay_off; + enum netxbig_led_mode mode; +}; + +struct netxbig_led { + const char *name; + const char *default_trigger; + int mode_addr; + int *mode_val; + int bright_addr; + int bright_max; +}; + +struct netxbig_led_platform_data { + struct netxbig_gpio_ext *gpio_ext; + struct netxbig_led_timer *timer; + int num_timer; + struct netxbig_led *leds; + int num_leds; +}; /* * GPIO extension bus. @@ -306,7 +347,6 @@ static int create_netxbig_led(struct platform_device *pdev, return devm_led_classdev_register(&pdev->dev, &led_dat->cdev); } -#ifdef CONFIG_OF_GPIO static int gpio_ext_get_of_pdata(struct device *dev, struct device_node *np, struct netxbig_gpio_ext *gpio_ext) { @@ -522,30 +562,20 @@ static const struct of_device_id of_netxbig_leds_match[] = { {}, }; MODULE_DEVICE_TABLE(of, of_netxbig_leds_match); -#else -static inline int -netxbig_leds_get_of_pdata(struct device *dev, - struct netxbig_led_platform_data *pdata) -{ - return -ENODEV; -} -#endif /* CONFIG_OF_GPIO */ static int netxbig_led_probe(struct platform_device *pdev) { - struct netxbig_led_platform_data *pdata = dev_get_platdata(&pdev->dev); + struct netxbig_led_platform_data *pdata; struct netxbig_led_data *leds_data; int i; int ret; - if (!pdata) { - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - return -ENOMEM; - ret = netxbig_leds_get_of_pdata(&pdev->dev, pdata); - if (ret) - return ret; - } + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + ret = netxbig_leds_get_of_pdata(&pdev->dev, pdata); + if (ret) + return ret; leds_data = devm_kcalloc(&pdev->dev, pdata->num_leds, sizeof(*leds_data), @@ -571,7 +601,7 @@ static struct platform_driver netxbig_led_driver = { .probe = netxbig_led_probe, .driver = { .name = "leds-netxbig", - .of_match_table = of_match_ptr(of_netxbig_leds_match), + .of_match_table = of_netxbig_leds_match, }, }; diff --git a/include/linux/platform_data/leds-kirkwood-netxbig.h b/include/linux/platform_data/leds-kirkwood-netxbig.h deleted file mode 100644 index 3c85a735c380..000000000000 --- a/include/linux/platform_data/leds-kirkwood-netxbig.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Platform data structure for netxbig LED driver - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - */ - -#ifndef __LEDS_KIRKWOOD_NETXBIG_H -#define __LEDS_KIRKWOOD_NETXBIG_H - -struct netxbig_gpio_ext { - unsigned *addr; - int num_addr; - unsigned *data; - int num_data; - unsigned enable; -}; - -enum netxbig_led_mode { - NETXBIG_LED_OFF, - NETXBIG_LED_ON, - NETXBIG_LED_SATA, - NETXBIG_LED_TIMER1, - NETXBIG_LED_TIMER2, - NETXBIG_LED_MODE_NUM, -}; - -#define NETXBIG_LED_INVALID_MODE NETXBIG_LED_MODE_NUM - -struct netxbig_led_timer { - unsigned long delay_on; - unsigned long delay_off; - enum netxbig_led_mode mode; -}; - -struct netxbig_led { - const char *name; - const char *default_trigger; - int mode_addr; - int *mode_val; - int bright_addr; - int bright_max; -}; - -struct netxbig_led_platform_data { - struct netxbig_gpio_ext *gpio_ext; - struct netxbig_led_timer *timer; - int num_timer; - struct netxbig_led *leds; - int num_leds; -}; - -#endif /* __LEDS_KIRKWOOD_NETXBIG_H */ -- cgit v1.2.3 From 6dbff13ca8a2ad2fddd904c2e789dd5e59a8644c Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 26 Jul 2019 18:06:52 +0200 Subject: include/bpf.h: Remove map_insert_ctx() stubs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we changed the device and CPU maps to use linked lists instead of bitmaps, we also removed the need for the map_insert_ctx() helpers to keep track of the bitmaps inside each map. However, it seems I forgot to remove the function definitions stubs, so remove those here. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 18f4cc2c6acd..bfdb54dd2ad1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -713,7 +713,6 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -721,7 +720,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); -void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -801,10 +799,6 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } -static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) -{ -} - static inline void __dev_map_flush(struct bpf_map *map) { } @@ -834,10 +828,6 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } -static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index) -{ -} - static inline void __cpu_map_flush(struct bpf_map *map) { } -- cgit v1.2.3 From 6f9d451ab1a33728adb72d7ff66a7b374d665176 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 26 Jul 2019 18:06:55 +0200 Subject: xdp: Add devmap_hash map type for looking up devices by hashed index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common pattern when using xdp_redirect_map() is to create a device map where the lookup key is simply ifindex. Because device maps are arrays, this leaves holes in the map, and the map has to be sized to fit the largest ifindex, regardless of how many devices actually are actually needed in the map. This patch adds a second type of device map where the key is looked up using a hashmap, instead of being used as an array index. This allows maps to be densely packed, so they can be smaller. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++ include/linux/bpf_types.h | 1 + include/trace/events/xdp.h | 3 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 200 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 + net/core/filter.c | 9 +- 7 files changed, 220 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bfdb54dd2ad1..f9a506147c8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -713,6 +713,7 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -799,6 +800,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } +static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + static inline void __dev_map_flush(struct bpf_map *map) { } diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index eec5aeeeaf92..36a9c2325176 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 68899fdc985b..8c8420230a10 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev { #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(fwd, map) \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \ ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e985f07a98ed..6bbef0c7f585 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -134,6 +134,7 @@ enum bpf_map_type { BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a0501266bdb8..9af048a932b5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -37,6 +37,12 @@ * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. + * + * The devmap_hash type is a map type which interprets keys as ifindexes and + * indexes these using a hashmap. This allows maps that use ifindex as key to be + * densely packed instead of having holes in the lookup array for unused + * ifindexes. The setup and packet enqueue/send code is shared between the two + * types of devmap; only the lookup and insertion is different. */ #include #include @@ -59,6 +65,7 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ + struct hlist_node index_hlist; struct bpf_dtab *dtab; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; @@ -70,11 +77,30 @@ struct bpf_dtab { struct bpf_dtab_netdev **netdev_map; struct list_head __percpu *flush_list; struct list_head list; + + /* these are only used for DEVMAP_HASH type maps */ + struct hlist_head *dev_index_head; + spinlock_t index_lock; + unsigned int items; + u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); +static struct hlist_head *dev_map_create_hash(unsigned int entries) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < entries; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -97,6 +123,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += sizeof(struct list_head) * num_possible_cpus(); + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + + if (!dtab->n_buckets) /* Overflow check */ + return -EINVAL; + cost += sizeof(struct hlist_head) * dtab->n_buckets; + } + /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) @@ -115,8 +149,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + if (!dtab->dev_index_head) + goto free_map_area; + + spin_lock_init(&dtab->index_lock); + } + return 0; +free_map_area: + bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -198,6 +242,7 @@ static void dev_map_free(struct bpf_map *map) free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); + kfree(dtab->dev_index_head); kfree(dtab); } @@ -218,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct hlist_head *head = dev_map_index_hash(dtab, key); + struct bpf_dtab_netdev *dev; + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->idx == key) + return dev; + + return NULL; +} + +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 idx, *next = next_key; + struct bpf_dtab_netdev *dev, *next_dev; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first; + + idx = *(u32 *)key; + + dev = __dev_map_hash_lookup_elem(map, idx); + if (!dev) + goto find_first; + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), + struct bpf_dtab_netdev, index_hlist); + + if (next_dev) { + *next = next_dev->idx; + return 0; + } + + i = idx & (dtab->n_buckets - 1); + i++; + + find_first: + for (; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_dtab_netdev, + index_hlist); + if (next_dev) { + *next = next_dev->idx; + return 0; + } + } + + return -ENOENT; +} + static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { @@ -373,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, + *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; + + return dev ? &dev->ifindex : NULL; +} + static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { @@ -422,6 +540,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + unsigned long flags; + int ret = -ENOENT; + + spin_lock_irqsave(&dtab->index_lock, flags); + + old_dev = __dev_map_hash_lookup_elem(map, k); + if (old_dev) { + dtab->items--; + hlist_del_init_rcu(&old_dev->index_hlist); + call_rcu(&old_dev->rcu, __dev_map_entry_free); + ret = 0; + } + spin_unlock_irqrestore(&dtab->index_lock, flags); + + return ret; +} + static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, u32 ifindex, @@ -502,6 +642,56 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev, *old_dev; + u32 ifindex = *(u32 *)value; + u32 idx = *(u32 *)key; + unsigned long flags; + + if (unlikely(map_flags > BPF_EXIST || !ifindex)) + return -EINVAL; + + old_dev = __dev_map_hash_lookup_elem(map, idx); + if (old_dev && (map_flags & BPF_NOEXIST)) + return -EEXIST; + + dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + spin_lock_irqsave(&dtab->index_lock, flags); + + if (old_dev) { + hlist_del_rcu(&old_dev->index_hlist); + } else { + if (dtab->items >= dtab->map.max_entries) { + spin_unlock_irqrestore(&dtab->index_lock, flags); + call_rcu(&dev->rcu, __dev_map_entry_free); + return -E2BIG; + } + dtab->items++; + } + + hlist_add_head_rcu(&dev->index_hlist, + dev_map_index_hash(dtab, idx)); + spin_unlock_irqrestore(&dtab->index_lock, flags); + + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_hash_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -512,6 +702,16 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, }; +const struct bpf_map_ops dev_map_hash_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_hash_get_next_key, + .map_lookup_elem = dev_map_hash_lookup_elem, + .map_update_elem = dev_map_hash_update_elem, + .map_delete_elem = dev_map_hash_delete_elem, + .map_check_btf = map_check_no_btf, +}; + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..cef851cd5c36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3457,6 +3457,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; @@ -3539,6 +3540,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && map->map_type != BPF_MAP_TYPE_CPUMAP && map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; diff --git a/net/core/filter.c b/net/core/filter.c index 3961437ccc50..1eee70f80fba 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3517,7 +3517,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, int err; switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: { + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: { struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); @@ -3554,6 +3555,7 @@ void xdp_do_flush_map(void) if (map) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: __dev_map_flush(map); break; case BPF_MAP_TYPE_CPUMAP: @@ -3574,6 +3576,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_DEVMAP_HASH: + return __dev_map_hash_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); case BPF_MAP_TYPE_XSKMAP: @@ -3655,7 +3659,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { struct bpf_dtab_netdev *dst = fwd; err = dev_map_generic_redirect(dst, skb, xdp_prog); -- cgit v1.2.3 From f3e4ff28b8685d856f381ee6bcf88b6149a6db5b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 24 Jul 2019 11:00:54 +0200 Subject: scsi: libfc: Whitespace cleanup in libfc.h No functional change. [mkp: typo] Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/scsi/libfc.h | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h index 2d64b53f947c..9b87e1a1c646 100644 --- a/include/scsi/libfc.h +++ b/include/scsi/libfc.h @@ -115,7 +115,7 @@ struct fc_disc_port { struct fc_lport *lp; struct list_head peers; struct work_struct rport_work; - u32 port_id; + u32 port_id; }; /** @@ -155,14 +155,14 @@ struct fc_rport_operations { */ struct fc_rport_libfc_priv { struct fc_lport *local_port; - enum fc_rport_state rp_state; + enum fc_rport_state rp_state; u16 flags; #define FC_RP_FLAGS_REC_SUPPORTED (1 << 0) #define FC_RP_FLAGS_RETRY (1 << 1) #define FC_RP_STARTED (1 << 2) #define FC_RP_FLAGS_CONF_REQ (1 << 3) - unsigned int e_d_tov; - unsigned int r_a_tov; + unsigned int e_d_tov; + unsigned int r_a_tov; }; /** @@ -191,24 +191,24 @@ struct fc_rport_priv { struct fc_lport *local_port; struct fc_rport *rport; struct kref kref; - enum fc_rport_state rp_state; + enum fc_rport_state rp_state; struct fc_rport_identifiers ids; u16 flags; - u16 max_seq; + u16 max_seq; u16 disc_id; u16 maxframe_size; - unsigned int retries; - unsigned int major_retries; - unsigned int e_d_tov; - unsigned int r_a_tov; - struct mutex rp_mutex; + unsigned int retries; + unsigned int major_retries; + unsigned int e_d_tov; + unsigned int r_a_tov; + struct mutex rp_mutex; struct delayed_work retry_work; - enum fc_rport_event event; + enum fc_rport_event event; struct fc_rport_operations *ops; - struct list_head peers; - struct work_struct event_work; + struct list_head peers; + struct work_struct event_work; u32 supported_classes; - u16 prli_count; + u16 prli_count; struct rcu_head rcu; u16 sp_features; u8 spp_type; @@ -618,12 +618,12 @@ struct libfc_function_template { * @disc_callback: Callback routine called when discovery completes */ struct fc_disc { - unsigned char retry_count; - unsigned char pending; - unsigned char requested; - unsigned short seq_count; - unsigned char buf_len; - u16 disc_id; + unsigned char retry_count; + unsigned char pending; + unsigned char requested; + unsigned short seq_count; + unsigned char buf_len; + u16 disc_id; struct list_head rports; void *priv; @@ -697,7 +697,7 @@ struct fc_lport { struct fc_rport_priv *ms_rdata; struct fc_rport_priv *ptp_rdata; void *scsi_priv; - struct fc_disc disc; + struct fc_disc disc; /* Virtual port information */ struct list_head vports; @@ -715,7 +715,7 @@ struct fc_lport { u8 retry_count; /* Fabric information */ - u32 port_id; + u32 port_id; u64 wwpn; u64 wwnn; unsigned int service_params; @@ -743,11 +743,11 @@ struct fc_lport { struct fc_ns_fts fcts; /* Miscellaneous */ - struct mutex lp_mutex; - struct list_head list; + struct mutex lp_mutex; + struct list_head list; struct delayed_work retry_work; void *prov[FC_FC4_PROV_SIZE]; - struct list_head lport_list; + struct list_head lport_list; }; /** -- cgit v1.2.3 From 023358b136d490ca91735ac6490db3741af5a8bd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 24 Jul 2019 11:00:55 +0200 Subject: scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure Gcc-9 complains for a memset across pointer boundaries, which happens as the code tries to allocate a flexible array on the stack. Turns out we cannot do this without relying on gcc-isms, so with this patch we'll embed the fc_rport_priv structure into fcoe_rport, can use the normal 'container_of' outcast, and will only have to do a memset over one structure. Signed-off-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/fcoe/fcoe_ctlr.c | 51 +++++++++++++++++-------------------------- drivers/scsi/libfc/fc_rport.c | 5 ++++- include/scsi/libfcoe.h | 1 + 3 files changed, 25 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c index 1a85fe9e4b7b..fc32b5d76821 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -2005,7 +2005,7 @@ EXPORT_SYMBOL_GPL(fcoe_wwn_from_mac); */ static inline struct fcoe_rport *fcoe_ctlr_rport(struct fc_rport_priv *rdata) { - return (struct fcoe_rport *)(rdata + 1); + return container_of(rdata, struct fcoe_rport, rdata); } /** @@ -2269,7 +2269,7 @@ static void fcoe_ctlr_vn_start(struct fcoe_ctlr *fip) */ static int fcoe_ctlr_vn_parse(struct fcoe_ctlr *fip, struct sk_buff *skb, - struct fc_rport_priv *rdata) + struct fcoe_rport *frport) { struct fip_header *fiph; struct fip_desc *desc = NULL; @@ -2277,16 +2277,12 @@ static int fcoe_ctlr_vn_parse(struct fcoe_ctlr *fip, struct fip_wwn_desc *wwn = NULL; struct fip_vn_desc *vn = NULL; struct fip_size_desc *size = NULL; - struct fcoe_rport *frport; size_t rlen; size_t dlen; u32 desc_mask = 0; u32 dtype; u8 sub; - memset(rdata, 0, sizeof(*rdata) + sizeof(*frport)); - frport = fcoe_ctlr_rport(rdata); - fiph = (struct fip_header *)skb->data; frport->flags = ntohs(fiph->fip_flags); @@ -2349,15 +2345,17 @@ static int fcoe_ctlr_vn_parse(struct fcoe_ctlr *fip, if (dlen != sizeof(struct fip_wwn_desc)) goto len_err; wwn = (struct fip_wwn_desc *)desc; - rdata->ids.node_name = get_unaligned_be64(&wwn->fd_wwn); + frport->rdata.ids.node_name = + get_unaligned_be64(&wwn->fd_wwn); break; case FIP_DT_VN_ID: if (dlen != sizeof(struct fip_vn_desc)) goto len_err; vn = (struct fip_vn_desc *)desc; memcpy(frport->vn_mac, vn->fd_mac, ETH_ALEN); - rdata->ids.port_id = ntoh24(vn->fd_fc_id); - rdata->ids.port_name = get_unaligned_be64(&vn->fd_wwpn); + frport->rdata.ids.port_id = ntoh24(vn->fd_fc_id); + frport->rdata.ids.port_name = + get_unaligned_be64(&vn->fd_wwpn); break; case FIP_DT_FC4F: if (dlen != sizeof(struct fip_fc4_feat)) @@ -2738,10 +2736,7 @@ static int fcoe_ctlr_vn_recv(struct fcoe_ctlr *fip, struct sk_buff *skb) { struct fip_header *fiph; enum fip_vn2vn_subcode sub; - struct { - struct fc_rport_priv rdata; - struct fcoe_rport frport; - } buf; + struct fcoe_rport frport = { }; int rc, vlan_id = 0; fiph = (struct fip_header *)skb->data; @@ -2757,7 +2752,7 @@ static int fcoe_ctlr_vn_recv(struct fcoe_ctlr *fip, struct sk_buff *skb) goto drop; } - rc = fcoe_ctlr_vn_parse(fip, skb, &buf.rdata); + rc = fcoe_ctlr_vn_parse(fip, skb, &frport); if (rc) { LIBFCOE_FIP_DBG(fip, "vn_recv vn_parse error %d\n", rc); goto drop; @@ -2766,19 +2761,19 @@ static int fcoe_ctlr_vn_recv(struct fcoe_ctlr *fip, struct sk_buff *skb) mutex_lock(&fip->ctlr_mutex); switch (sub) { case FIP_SC_VN_PROBE_REQ: - fcoe_ctlr_vn_probe_req(fip, &buf.rdata); + fcoe_ctlr_vn_probe_req(fip, &frport.rdata); break; case FIP_SC_VN_PROBE_REP: - fcoe_ctlr_vn_probe_reply(fip, &buf.rdata); + fcoe_ctlr_vn_probe_reply(fip, &frport.rdata); break; case FIP_SC_VN_CLAIM_NOTIFY: - fcoe_ctlr_vn_claim_notify(fip, &buf.rdata); + fcoe_ctlr_vn_claim_notify(fip, &frport.rdata); break; case FIP_SC_VN_CLAIM_REP: - fcoe_ctlr_vn_claim_resp(fip, &buf.rdata); + fcoe_ctlr_vn_claim_resp(fip, &frport.rdata); break; case FIP_SC_VN_BEACON: - fcoe_ctlr_vn_beacon(fip, &buf.rdata); + fcoe_ctlr_vn_beacon(fip, &frport.rdata); break; default: LIBFCOE_FIP_DBG(fip, "vn_recv unknown subcode %d\n", sub); @@ -2802,22 +2797,18 @@ drop: */ static int fcoe_ctlr_vlan_parse(struct fcoe_ctlr *fip, struct sk_buff *skb, - struct fc_rport_priv *rdata) + struct fcoe_rport *frport) { struct fip_header *fiph; struct fip_desc *desc = NULL; struct fip_mac_desc *macd = NULL; struct fip_wwn_desc *wwn = NULL; - struct fcoe_rport *frport; size_t rlen; size_t dlen; u32 desc_mask = 0; u32 dtype; u8 sub; - memset(rdata, 0, sizeof(*rdata) + sizeof(*frport)); - frport = fcoe_ctlr_rport(rdata); - fiph = (struct fip_header *)skb->data; frport->flags = ntohs(fiph->fip_flags); @@ -2871,7 +2862,8 @@ static int fcoe_ctlr_vlan_parse(struct fcoe_ctlr *fip, if (dlen != sizeof(struct fip_wwn_desc)) goto len_err; wwn = (struct fip_wwn_desc *)desc; - rdata->ids.node_name = get_unaligned_be64(&wwn->fd_wwn); + frport->rdata.ids.node_name = + get_unaligned_be64(&wwn->fd_wwn); break; default: LIBFCOE_FIP_DBG(fip, "unexpected descriptor type %x " @@ -2982,22 +2974,19 @@ static int fcoe_ctlr_vlan_recv(struct fcoe_ctlr *fip, struct sk_buff *skb) { struct fip_header *fiph; enum fip_vlan_subcode sub; - struct { - struct fc_rport_priv rdata; - struct fcoe_rport frport; - } buf; + struct fcoe_rport frport = { }; int rc; fiph = (struct fip_header *)skb->data; sub = fiph->fip_subcode; - rc = fcoe_ctlr_vlan_parse(fip, skb, &buf.rdata); + rc = fcoe_ctlr_vlan_parse(fip, skb, &frport); if (rc) { LIBFCOE_FIP_DBG(fip, "vlan_recv vlan_parse error %d\n", rc); goto drop; } mutex_lock(&fip->ctlr_mutex); if (sub == FIP_SC_VL_REQ) - fcoe_ctlr_vlan_disc_reply(fip, &buf.rdata); + fcoe_ctlr_vlan_disc_reply(fip, &frport.rdata); mutex_unlock(&fip->ctlr_mutex); drop: diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index e0f3852fdad1..da6e97d8dc3b 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -128,6 +128,7 @@ EXPORT_SYMBOL(fc_rport_lookup); struct fc_rport_priv *fc_rport_create(struct fc_lport *lport, u32 port_id) { struct fc_rport_priv *rdata; + size_t rport_priv_size = sizeof(*rdata); lockdep_assert_held(&lport->disc.disc_mutex); @@ -135,7 +136,9 @@ struct fc_rport_priv *fc_rport_create(struct fc_lport *lport, u32 port_id) if (rdata) return rdata; - rdata = kzalloc(sizeof(*rdata) + lport->rport_priv_size, GFP_KERNEL); + if (lport->rport_priv_size > 0) + rport_priv_size = lport->rport_priv_size; + rdata = kzalloc(rport_priv_size, GFP_KERNEL); if (!rdata) return NULL; diff --git a/include/scsi/libfcoe.h b/include/scsi/libfcoe.h index dc14b52577f7..2568cb0627ec 100644 --- a/include/scsi/libfcoe.h +++ b/include/scsi/libfcoe.h @@ -229,6 +229,7 @@ struct fcoe_fcf { * @vn_mac: VN_Node assigned MAC address for data */ struct fcoe_rport { + struct fc_rport_priv rdata; unsigned long time; u16 fcoe_len; u16 flags; -- cgit v1.2.3 From 9091373ab7ea27cad381ce71aa37de6b9e687e81 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 30 Jul 2019 14:43:47 +0900 Subject: gpio: remove less important #ifdef around declarations The whole struct/function declarations in this header are surrounded by #ifdef. As far as I understood, the motivation of this is probably to break the build earlier if a driver misses to select or depend on correct CONFIG options in Kconfig. Since commit 94bed2a9c4ae ("Add -Werror-implicit-function-declaration") no one cannot call functions that have not been declared. So, I see some benefit in doing this in the cost of uglier headers. In reality, it would not be so easy to catch missed 'select' or 'depends on' because GPIOLIB, GPIOLIB_IRQCHIP etc. are already selected by someone else eventually. So, this kind of error, if any, will be caught by randconfig bots. In summary, I am not a big fan of cluttered #ifdef nesting, and this does not matter for normal developers. The code readability wins. Signed-off-by: Masahiro Yamada Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 6a0e420915a3..f28f534f451a 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -20,9 +20,6 @@ struct module; enum gpiod_flags; enum gpio_lookup_flags; -#ifdef CONFIG_GPIOLIB - -#ifdef CONFIG_GPIOLIB_IRQCHIP /** * struct gpio_irq_chip - GPIO interrupt controller */ @@ -161,7 +158,6 @@ struct gpio_irq_chip { */ void (*irq_disable)(struct irq_data *data); }; -#endif /* CONFIG_GPIOLIB_IRQCHIP */ /** * struct gpio_chip - abstract a GPIO controller @@ -441,16 +437,12 @@ bool gpiochip_line_is_valid(const struct gpio_chip *chip, unsigned int offset); /* get driver data */ void *gpiochip_get_data(struct gpio_chip *chip); -struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc); - struct bgpio_pdata { const char *label; int base; int ngpio; }; -#if IS_ENABLED(CONFIG_GPIO_GENERIC) - int bgpio_init(struct gpio_chip *gc, struct device *dev, unsigned long sz, void __iomem *dat, void __iomem *set, void __iomem *clr, void __iomem *dirout, void __iomem *dirin, @@ -463,10 +455,6 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev, #define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ #define BGPIOF_NO_OUTPUT BIT(5) /* only input */ -#endif /* CONFIG_GPIO_GENERIC */ - -#ifdef CONFIG_GPIOLIB_IRQCHIP - int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq); void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq); @@ -555,15 +543,11 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, } #endif /* CONFIG_LOCKDEP */ -#endif /* CONFIG_GPIOLIB_IRQCHIP */ - int gpiochip_generic_request(struct gpio_chip *chip, unsigned offset); void gpiochip_generic_free(struct gpio_chip *chip, unsigned offset); int gpiochip_generic_config(struct gpio_chip *chip, unsigned offset, unsigned long config); -#ifdef CONFIG_PINCTRL - /** * struct gpio_pin_range - pin range controlled by a gpio chip * @node: list for maintaining set of pin ranges, used internally @@ -576,6 +560,8 @@ struct gpio_pin_range { struct pinctrl_gpio_range range; }; +#ifdef CONFIG_PINCTRL + int gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, unsigned int npins); @@ -586,8 +572,6 @@ void gpiochip_remove_pin_ranges(struct gpio_chip *chip); #else /* ! CONFIG_PINCTRL */ -struct pinctrl_dev; - static inline int gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, @@ -619,6 +603,11 @@ void gpiochip_free_own_desc(struct gpio_desc *desc); void devprop_gpiochip_set_names(struct gpio_chip *chip, const struct fwnode_handle *fwnode); + +#ifdef CONFIG_GPIOLIB + +struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc); + #else /* CONFIG_GPIOLIB */ static inline struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc) @@ -630,4 +619,4 @@ static inline struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc) #endif /* CONFIG_GPIOLIB */ -#endif +#endif /* __LINUX_GPIO_DRIVER_H */ -- cgit v1.2.3 From 10a08fd65ec1a68ccd86b19ec822ed5f2e50113f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 30 Jul 2019 11:55:59 +0200 Subject: ACPI: PM: Set up EC GPE for system wakeup from drivers that need it The EC GPE needs to be set up for system wakeup only if there is a driver depending on it, either intel-hid or intel-vbtn, bound to a button device that is expected to wake up the system from sleep (such as the power button on some Dell systems, like the XPS13 9360). It doesn't need to be set up for waking up the system from sleep in any other cases and whether or not it is expected to wake up the system from sleep doesn't depend on whether or not the LPS0 device is present in the ACPI namespace. For this reason, rearrange the ACPI suspend-to-idle code to make the drivers depending on the EC GPE wakeup take care of setting it up and decouple that from the LPS0 device handling. While at it, make intel-hid and intel-vbtn prepare for system wakeup only if they are allowed to wake up the system from sleep by user space (via sysfs). [Note that acpi_ec_mark_gpe_for_wake() and acpi_ec_set_gpe_wake_mask() are there to prevent the EC GPE from being disabled by the acpi_enable_all_wakeup_gpes() call in acpi_s2idle_prepare(), so on systems with either intel-hid or intel-vbtn this change doesn't affect any interactions with the hardware or platform firmware.] Signed-off-by: Rafael J. Wysocki Reviewed-by: Andy Shevchenko --- drivers/acpi/ec.c | 7 ++++++- drivers/acpi/internal.h | 2 -- drivers/acpi/sleep.c | 13 ++----------- drivers/platform/x86/intel-hid.c | 20 ++++++++++++++++---- drivers/platform/x86/intel-vbtn.c | 20 ++++++++++++++++---- include/linux/acpi.h | 4 ++++ include/linux/suspend.h | 1 + 7 files changed, 45 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 58c7ad402d8d..b996ca5f253f 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1048,17 +1049,21 @@ void acpi_ec_unblock_transactions(void) acpi_ec_start(first_ec, true); } +#ifdef CONFIG_PM_SLEEP void acpi_ec_mark_gpe_for_wake(void) { if (first_ec && !ec_no_wakeup) acpi_mark_gpe_for_wake(NULL, first_ec->gpe); } +EXPORT_SYMBOL_GPL(acpi_ec_mark_gpe_for_wake); void acpi_ec_set_gpe_wake_mask(u8 action) { - if (first_ec && !ec_no_wakeup) + if (pm_suspend_no_platform() && first_ec && !ec_no_wakeup) acpi_set_gpe_wake_mask(NULL, first_ec->gpe, action); } +EXPORT_SYMBOL_GPL(acpi_ec_set_gpe_wake_mask); +#endif bool acpi_ec_dispatch_gpe(void) { diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 1b5f9ac06ea8..bcc080511197 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -194,8 +194,6 @@ void acpi_ec_ecdt_probe(void); void acpi_ec_dsdt_probe(void); void acpi_ec_block_transactions(void); void acpi_ec_unblock_transactions(void); -void acpi_ec_mark_gpe_for_wake(void); -void acpi_ec_set_gpe_wake_mask(u8 action); bool acpi_ec_dispatch_gpe(void); int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit, acpi_handle handle, acpi_ec_query_func func, diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 970ae7c7a3f7..9cb0532f7471 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -930,8 +930,6 @@ static int lps0_device_attach(struct acpi_device *adev, acpi_handle_debug(adev->handle, "_DSM function mask: 0x%x\n", bitmask); - - acpi_ec_mark_gpe_for_wake(); } else { acpi_handle_debug(adev->handle, "_DSM function 0 evaluation failed\n"); @@ -960,8 +958,6 @@ static int acpi_s2idle_prepare(void) if (lps0_device_handle) { acpi_sleep_run_lps0_dsm(ACPI_LPS0_SCREEN_OFF); acpi_sleep_run_lps0_dsm(ACPI_LPS0_ENTRY); - - acpi_ec_set_gpe_wake_mask(ACPI_GPE_ENABLE); } if (acpi_sci_irq_valid()) @@ -979,10 +975,7 @@ static int acpi_s2idle_prepare(void) static void acpi_s2idle_wake(void) { - if (!lps0_device_handle) - return; - - if (pm_debug_messages_on) + if (lps0_device_handle && pm_debug_messages_on) lpi_check_constraints(); /* @@ -1031,8 +1024,6 @@ static void acpi_s2idle_restore(void) disable_irq_wake(acpi_sci_irq); if (lps0_device_handle) { - acpi_ec_set_gpe_wake_mask(ACPI_GPE_DISABLE); - acpi_sleep_run_lps0_dsm(ACPI_LPS0_EXIT); acpi_sleep_run_lps0_dsm(ACPI_LPS0_SCREEN_ON); } @@ -1081,7 +1072,7 @@ bool acpi_s2idle_wakeup(void) bool acpi_sleep_no_ec_events(void) { - return !s2idle_in_progress || !lps0_device_handle; + return !s2idle_in_progress; } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/platform/x86/intel-hid.c b/drivers/platform/x86/intel-hid.c index bc0d55a59015..e51c72b92cbd 100644 --- a/drivers/platform/x86/intel-hid.c +++ b/drivers/platform/x86/intel-hid.c @@ -253,9 +253,12 @@ static void intel_button_array_enable(struct device *device, bool enable) static int intel_hid_pm_prepare(struct device *device) { - struct intel_hid_priv *priv = dev_get_drvdata(device); + if (device_may_wakeup(device)) { + struct intel_hid_priv *priv = dev_get_drvdata(device); - priv->wakeup_mode = true; + priv->wakeup_mode = true; + acpi_ec_set_gpe_wake_mask(ACPI_GPE_ENABLE); + } return 0; } @@ -270,9 +273,12 @@ static int intel_hid_pl_suspend_handler(struct device *device) static int intel_hid_pl_resume_handler(struct device *device) { - struct intel_hid_priv *priv = dev_get_drvdata(device); + if (device_may_wakeup(device)) { + struct intel_hid_priv *priv = dev_get_drvdata(device); - priv->wakeup_mode = false; + acpi_ec_set_gpe_wake_mask(ACPI_GPE_DISABLE); + priv->wakeup_mode = false; + } if (pm_resume_via_firmware()) { intel_hid_set_enable(device, true); intel_button_array_enable(device, true); @@ -491,6 +497,12 @@ static int intel_hid_probe(struct platform_device *device) } device_init_wakeup(&device->dev, true); + /* + * In order for system wakeup to work, the EC GPE has to be marked as + * a wakeup one, so do that here (this setting will persist, but it has + * no effect until the wakeup mask is set for the EC GPE). + */ + acpi_ec_mark_gpe_for_wake(); return 0; err_remove_notify: diff --git a/drivers/platform/x86/intel-vbtn.c b/drivers/platform/x86/intel-vbtn.c index a0d0cecff55f..ab84e1bbdedd 100644 --- a/drivers/platform/x86/intel-vbtn.c +++ b/drivers/platform/x86/intel-vbtn.c @@ -176,6 +176,12 @@ static int intel_vbtn_probe(struct platform_device *device) return -EBUSY; device_init_wakeup(&device->dev, true); + /* + * In order for system wakeup to work, the EC GPE has to be marked as + * a wakeup one, so do that here (this setting will persist, but it has + * no effect until the wakeup mask is set for the EC GPE). + */ + acpi_ec_mark_gpe_for_wake(); return 0; } @@ -195,17 +201,23 @@ static int intel_vbtn_remove(struct platform_device *device) static int intel_vbtn_pm_prepare(struct device *dev) { - struct intel_vbtn_priv *priv = dev_get_drvdata(dev); + if (device_may_wakeup(dev)) { + struct intel_vbtn_priv *priv = dev_get_drvdata(dev); - priv->wakeup_mode = true; + priv->wakeup_mode = true; + acpi_ec_set_gpe_wake_mask(ACPI_GPE_ENABLE); + } return 0; } static int intel_vbtn_pm_resume(struct device *dev) { - struct intel_vbtn_priv *priv = dev_get_drvdata(dev); + if (device_may_wakeup(dev)) { + struct intel_vbtn_priv *priv = dev_get_drvdata(dev); - priv->wakeup_mode = false; + acpi_ec_set_gpe_wake_mask(ACPI_GPE_DISABLE); + priv->wakeup_mode = false; + } return 0; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9426b9aaed86..e65a4c5bbeae 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -931,6 +931,8 @@ int acpi_subsys_suspend_noirq(struct device *dev); int acpi_subsys_suspend(struct device *dev); int acpi_subsys_freeze(struct device *dev); int acpi_subsys_poweroff(struct device *dev); +void acpi_ec_mark_gpe_for_wake(void); +void acpi_ec_set_gpe_wake_mask(u8 action); #else static inline int acpi_subsys_prepare(struct device *dev) { return 0; } static inline void acpi_subsys_complete(struct device *dev) {} @@ -939,6 +941,8 @@ static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; } static inline int acpi_subsys_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_freeze(struct device *dev) { return 0; } static inline int acpi_subsys_poweroff(struct device *dev) { return 0; } +static inline void acpi_ec_mark_gpe_for_wake(void) {} +static inline void acpi_ec_set_gpe_wake_mask(u8 action) {} #endif #ifdef CONFIG_ACPI diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 66ce3871ed61..f0c4a8445140 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -335,6 +335,7 @@ static inline void pm_set_suspend_via_firmware(void) {} static inline void pm_set_resume_via_firmware(void) {} static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } +static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} -- cgit v1.2.3 From 6cda08a20dbde45b021091230c8a359fa08c5103 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Jul 2019 23:18:32 +0100 Subject: drivers: Introduce device lookup variants by name Add a helper to match the device name for device lookup. Also reuse this generic exported helper for the existing bus_find_device_by_name(). and add similar variants for driver/class. Cc: Alessandro Zummo Cc: Alexander Aring Cc: Alexander Shishkin Cc: Arnd Bergmann Cc: Dan Murphy Cc: Harald Freudenberger Cc: Heiko Carstens Cc: Jacek Anaszewski Cc: Lee Jones Cc: linux-leds@vger.kernel.org Cc: linux-rtc@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: linux-wpan@vger.kernel.org Cc: Maxime Coquelin Cc: Pavel Machek Cc: Peter Oberparleiter Cc: "Rafael J. Wysocki" Cc: Stefan Schmidt Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Suzuki K Poulose Reviewed-by: Heikki Krogerus Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20190723221838.12024-2-suzuki.poulose@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/bus.c | 24 ----------------------- drivers/base/core.c | 6 ++++++ drivers/hwtracing/stm/core.c | 9 +-------- drivers/leds/led-class.c | 9 +-------- drivers/rtc/interface.c | 11 +---------- drivers/s390/cio/ccwgroup.c | 10 +--------- drivers/s390/cio/device.c | 15 +------------- drivers/s390/crypto/zcrypt_api.c | 11 +---------- drivers/usb/roles/class.c | 8 +------- drivers/usb/typec/class.c | 8 +------- include/linux/device.h | 42 +++++++++++++++++++++++++++++++++++++--- net/ieee802154/core.c | 7 +------ 12 files changed, 54 insertions(+), 106 deletions(-) (limited to 'include') diff --git a/drivers/base/bus.c b/drivers/base/bus.c index df3cac739813..a1d1e8256324 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -342,30 +342,6 @@ struct device *bus_find_device(struct bus_type *bus, } EXPORT_SYMBOL_GPL(bus_find_device); -static int match_name(struct device *dev, const void *data) -{ - const char *name = data; - - return sysfs_streq(name, dev_name(dev)); -} - -/** - * bus_find_device_by_name - device iterator for locating a particular device of a specific name - * @bus: bus type - * @start: Device to begin with - * @name: name of the device to match - * - * This is similar to the bus_find_device() function above, but it handles - * searching by a name automatically, no need to write another strcmp matching - * function. - */ -struct device *bus_find_device_by_name(struct bus_type *bus, - struct device *start, const char *name) -{ - return bus_find_device(bus, start, (void *)name, match_name); -} -EXPORT_SYMBOL_GPL(bus_find_device_by_name); - /** * subsys_find_device_by_id - find a device with a specific enumeration number * @subsys: subsystem diff --git a/drivers/base/core.c b/drivers/base/core.c index da84a73f2ba6..fb83647d685a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3357,6 +3357,12 @@ void device_set_of_node_from_dev(struct device *dev, const struct device *dev2) } EXPORT_SYMBOL_GPL(device_set_of_node_from_dev); +int device_match_name(struct device *dev, const void *name) +{ + return sysfs_streq(dev_name(dev), name); +} +EXPORT_SYMBOL_GPL(device_match_name); + int device_match_of_node(struct device *dev, const void *np) { return dev->of_node == np; diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index e55b902560de..2b6bd42632e8 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -89,13 +89,6 @@ static struct class stm_class = { .dev_groups = stm_groups, }; -static int stm_dev_match(struct device *dev, const void *data) -{ - const char *name = data; - - return sysfs_streq(name, dev_name(dev)); -} - /** * stm_find_device() - find stm device by name * @buf: character buffer containing the name @@ -116,7 +109,7 @@ struct stm_device *stm_find_device(const char *buf) if (!stm_core_up) return NULL; - dev = class_find_device(&stm_class, NULL, buf, stm_dev_match); + dev = class_find_device_by_name(&stm_class, buf); if (!dev) return NULL; diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 4793e77808e2..d54c8e4d8954 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -213,13 +213,6 @@ static int led_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(leds_class_dev_pm_ops, led_suspend, led_resume); -static int match_name(struct device *dev, const void *data) -{ - if (!dev_name(dev)) - return 0; - return !strcmp(dev_name(dev), (char *)data); -} - static int led_classdev_next_name(const char *init_name, char *name, size_t len) { @@ -230,7 +223,7 @@ static int led_classdev_next_name(const char *init_name, char *name, strlcpy(name, init_name, len); while ((ret < len) && - (dev = class_find_device(leds_class, NULL, name, match_name))) { + (dev = class_find_device_by_name(leds_class, name))) { put_device(dev); ret = snprintf(name, len, "%s_%u", init_name, ++i); } diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 72b7ddc43116..c93ef33b01d3 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -663,21 +663,12 @@ void rtc_update_irq(struct rtc_device *rtc, } EXPORT_SYMBOL_GPL(rtc_update_irq); -static int __rtc_match(struct device *dev, const void *data) -{ - const char *name = data; - - if (strcmp(dev_name(dev), name) == 0) - return 1; - return 0; -} - struct rtc_device *rtc_class_open(const char *name) { struct device *dev; struct rtc_device *rtc = NULL; - dev = class_find_device(rtc_class, NULL, name, __rtc_match); + dev = class_find_device_by_name(rtc_class, name); if (dev) rtc = to_rtc_device(dev); diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index c522e9313c50..d843e362c167 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -608,13 +608,6 @@ void ccwgroup_driver_unregister(struct ccwgroup_driver *cdriver) } EXPORT_SYMBOL(ccwgroup_driver_unregister); -static int __ccwgroupdev_check_busid(struct device *dev, const void *id) -{ - const char *bus_id = id; - - return (strcmp(bus_id, dev_name(dev)) == 0); -} - /** * get_ccwgroupdev_by_busid() - obtain device from a bus id * @gdrv: driver the device is owned by @@ -631,8 +624,7 @@ struct ccwgroup_device *get_ccwgroupdev_by_busid(struct ccwgroup_driver *gdrv, { struct device *dev; - dev = driver_find_device(&gdrv->driver, NULL, bus_id, - __ccwgroupdev_check_busid); + dev = driver_find_device_by_name(&gdrv->driver, bus_id); return dev ? to_ccwgroupdev(dev) : NULL; } diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index c421899be20f..131430bd48d9 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1695,18 +1695,6 @@ int ccw_device_force_console(struct ccw_device *cdev) EXPORT_SYMBOL_GPL(ccw_device_force_console); #endif -/* - * get ccw_device matching the busid, but only if owned by cdrv - */ -static int -__ccwdev_check_busid(struct device *dev, const void *id) -{ - const char *bus_id = id; - - return (strcmp(bus_id, dev_name(dev)) == 0); -} - - /** * get_ccwdev_by_busid() - obtain device from a bus id * @cdrv: driver the device is owned by @@ -1723,8 +1711,7 @@ struct ccw_device *get_ccwdev_by_busid(struct ccw_driver *cdrv, { struct device *dev; - dev = driver_find_device(&cdrv->driver, NULL, (void *)bus_id, - __ccwdev_check_busid); + dev = driver_find_device_by_name(&cdrv->driver, bus_id); return dev ? to_ccwdev(dev) : NULL; } diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 1058b4b5cc1e..38a5a47b8c9c 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -133,12 +133,6 @@ struct zcdn_device { static int zcdn_create(const char *name); static int zcdn_destroy(const char *name); -/* helper function, matches the name for find_zcdndev_by_name() */ -static int __match_zcdn_name(struct device *dev, const void *data) -{ - return strcmp(dev_name(dev), (const char *)data) == 0; -} - /* helper function, matches the devt value for find_zcdndev_by_devt() */ static int __match_zcdn_devt(struct device *dev, const void *data) { @@ -152,10 +146,7 @@ static int __match_zcdn_devt(struct device *dev, const void *data) */ static inline struct zcdn_device *find_zcdndev_by_name(const char *name) { - struct device *dev = - class_find_device(zcrypt_class, NULL, - (void *) name, - __match_zcdn_name); + struct device *dev = class_find_device_by_name(zcrypt_class, name); return dev ? to_zcdn_dev(dev) : NULL; } diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c index 86defca6623e..c8efe60e2465 100644 --- a/drivers/usb/roles/class.c +++ b/drivers/usb/roles/class.c @@ -90,11 +90,6 @@ static int switch_fwnode_match(struct device *dev, const void *fwnode) return dev_fwnode(dev) == fwnode; } -static int switch_name_match(struct device *dev, const void *name) -{ - return !strcmp((const char *)name, dev_name(dev)); -} - static void *usb_role_switch_match(struct device_connection *con, int ep, void *data) { @@ -107,8 +102,7 @@ static void *usb_role_switch_match(struct device_connection *con, int ep, dev = class_find_device(role_class, NULL, con->fwnode, switch_fwnode_match); } else { - dev = class_find_device(role_class, NULL, con->endpoint[ep], - switch_name_match); + dev = class_find_device_by_name(role_class, con->endpoint[ep]); } return dev ? to_role_switch(dev) : ERR_PTR(-EPROBE_DEFER); diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index a18285a990a8..9b0d15b487e5 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -210,11 +210,6 @@ static int typec_port_fwnode_match(struct device *dev, const void *fwnode) return dev_fwnode(dev) == fwnode; } -static int typec_port_name_match(struct device *dev, const void *name) -{ - return !strcmp((const char *)name, dev_name(dev)); -} - static void *typec_port_match(struct device_connection *con, int ep, void *data) { struct device *dev; @@ -227,8 +222,7 @@ static void *typec_port_match(struct device_connection *con, int ep, void *data) return class_find_device(typec_class, NULL, con->fwnode, typec_port_fwnode_match); - dev = class_find_device(typec_class, NULL, con->endpoint[ep], - typec_port_name_match); + dev = class_find_device_by_name(typec_class, con->endpoint[ep]); return dev ? dev : ERR_PTR(-EPROBE_DEFER); } diff --git a/include/linux/device.h b/include/linux/device.h index c330b75c6c57..3ba376b8b456 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -164,6 +164,7 @@ void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter); void subsys_dev_iter_exit(struct subsys_dev_iter *iter); +int device_match_name(struct device *dev, const void *name); int device_match_of_node(struct device *dev, const void *np); int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, @@ -171,9 +172,20 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, struct device *bus_find_device(struct bus_type *bus, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)); -struct device *bus_find_device_by_name(struct bus_type *bus, - struct device *start, - const char *name); +/** + * bus_find_device_by_name - device iterator for locating a particular device + * of a specific name. + * @bus: bus type + * @start: Device to begin with + * @name: name of the device to match + */ +static inline struct device *bus_find_device_by_name(struct bus_type *bus, + struct device *start, + const char *name) +{ + return bus_find_device(bus, start, name, device_match_name); +} + struct device *subsys_find_device_by_id(struct bus_type *bus, unsigned int id, struct device *hint); int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, @@ -342,6 +354,18 @@ struct device *driver_find_device(struct device_driver *drv, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)); +/** + * driver_find_device_by_name - device iterator for locating a particular device + * of a specific name. + * @driver: the driver we're iterating + * @name: name of the device to match + */ +static inline struct device *driver_find_device_by_name(struct device_driver *drv, + const char *name) +{ + return driver_find_device(drv, NULL, name, device_match_name); +} + void driver_deferred_probe_add(struct device *dev); int driver_deferred_probe_check_state(struct device *dev); int driver_deferred_probe_check_state_continue(struct device *dev); @@ -471,6 +495,18 @@ extern struct device *class_find_device(struct class *class, struct device *start, const void *data, int (*match)(struct device *, const void *)); +/** + * class_find_device_by_name - device iterator for locating a particular device + * of a specific name. + * @class: class type + * @name: name of the device to match + */ +static inline struct device *class_find_device_by_name(struct class *class, + const char *name) +{ + return class_find_device(class, NULL, name, device_match_name); +} + struct class_attribute { struct attribute attr; ssize_t (*show)(struct class *class, struct class_attribute *attr, diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 60b7ac56a1f5..de259b5170ab 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -23,11 +23,6 @@ LIST_HEAD(cfg802154_rdev_list); int cfg802154_rdev_list_generation; -static int wpan_phy_match(struct device *dev, const void *data) -{ - return !strcmp(dev_name(dev), (const char *)data); -} - struct wpan_phy *wpan_phy_find(const char *str) { struct device *dev; @@ -35,7 +30,7 @@ struct wpan_phy *wpan_phy_find(const char *str) if (WARN_ON(!str)) return NULL; - dev = class_find_device(&wpan_phy_class, NULL, str, wpan_phy_match); + dev = class_find_device_by_name(&wpan_phy_class, str); if (!dev) return NULL; -- cgit v1.2.3 From cfba5de9b99f8bbb8b4ea11b3049784e78b8759b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Jul 2019 23:18:33 +0100 Subject: drivers: Introduce device lookup variants by of_node Introduce wrappers for {bus/driver/class}_find_device() to locate devices by its of_node. Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter Cc: devicetree@vger.kernel.org Cc: Florian Fainelli Cc: Frank Rowand Cc: Heiko Stuebner Cc: Liam Girdwood Cc: linux-i2c@vger.kernel.org Cc: linux-rockchip@lists.infradead.org Cc: linux-spi@vger.kernel.org Cc: Mathieu Poirier Cc: Rob Herring Cc: Srinivas Kandagatla Cc: Takashi Iwai Cc: Alan Tull Cc: linux-fpga@vger.kernel.org Cc: Peter Rosin Cc: Florian Fainelli Cc: Heiner Kallweit Cc: "David S. Miller" Cc: Andrew Lunn Cc: Liam Girdwood Cc: "Rafael J. Wysocki" Cc: Thor Thayer Cc: Jiri Slaby Cc: Andrew Lunn Cc: Peter Rosin Signed-off-by: Suzuki K Poulose Acked-by: Lee Jones Acked-by: Wolfram Sang # I2C part Acked-by: Moritz Fischer # For FPGA part Acked-by: Mark Brown Link: https://lore.kernel.org/r/20190723221838.12024-3-suzuki.poulose@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/amba/tegra-ahb.c | 11 +---------- drivers/fpga/fpga-bridge.c | 8 +------- drivers/fpga/fpga-mgr.c | 8 +------- drivers/gpu/drm/drm_mipi_dsi.c | 7 +------ drivers/i2c/i2c-core-of.c | 7 +------ drivers/mfd/altera-sysmgr.c | 14 ++----------- drivers/mux/core.c | 7 +------ drivers/net/phy/mdio_bus.c | 9 +-------- drivers/nvmem/core.c | 7 +------ drivers/of/of_mdio.c | 8 +------- drivers/of/platform.c | 7 +------ drivers/regulator/of_regulator.c | 7 +------ drivers/spi/spi.c | 20 ++++--------------- include/linux/device.h | 37 +++++++++++++++++++++++++++++++++++ sound/soc/rockchip/rk3399_gru_sound.c | 9 ++------- 15 files changed, 56 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/drivers/amba/tegra-ahb.c b/drivers/amba/tegra-ahb.c index aa64eece77a6..57d3b2e2d007 100644 --- a/drivers/amba/tegra-ahb.c +++ b/drivers/amba/tegra-ahb.c @@ -134,22 +134,13 @@ static inline void gizmo_writel(struct tegra_ahb *ahb, u32 value, u32 offset) } #ifdef CONFIG_TEGRA_IOMMU_SMMU -static int tegra_ahb_match_by_smmu(struct device *dev, const void *data) -{ - struct tegra_ahb *ahb = dev_get_drvdata(dev); - const struct device_node *dn = data; - - return (ahb->dev->of_node == dn) ? 1 : 0; -} - int tegra_ahb_enable_smmu(struct device_node *dn) { struct device *dev; u32 val; struct tegra_ahb *ahb; - dev = driver_find_device(&tegra_ahb_driver.driver, NULL, dn, - tegra_ahb_match_by_smmu); + dev = driver_find_device_by_of_node(&tegra_ahb_driver.driver, dn); if (!dev) return -EPROBE_DEFER; ahb = dev_get_drvdata(dev); diff --git a/drivers/fpga/fpga-bridge.c b/drivers/fpga/fpga-bridge.c index 80bd8f1b2aa6..4bab9028940a 100644 --- a/drivers/fpga/fpga-bridge.c +++ b/drivers/fpga/fpga-bridge.c @@ -19,11 +19,6 @@ static struct class *fpga_bridge_class; /* Lock for adding/removing bridges to linked lists*/ static spinlock_t bridge_list_lock; -static int fpga_bridge_of_node_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - /** * fpga_bridge_enable - Enable transactions on the bridge * @@ -104,8 +99,7 @@ struct fpga_bridge *of_fpga_bridge_get(struct device_node *np, { struct device *dev; - dev = class_find_device(fpga_bridge_class, NULL, np, - fpga_bridge_of_node_match); + dev = class_find_device_by_of_node(fpga_bridge_class, np); if (!dev) return ERR_PTR(-ENODEV); diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c index c3866816456a..e05104f5e40c 100644 --- a/drivers/fpga/fpga-mgr.c +++ b/drivers/fpga/fpga-mgr.c @@ -482,11 +482,6 @@ struct fpga_manager *fpga_mgr_get(struct device *dev) } EXPORT_SYMBOL_GPL(fpga_mgr_get); -static int fpga_mgr_of_node_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - /** * of_fpga_mgr_get - Given a device node, get a reference to a fpga mgr. * @@ -498,8 +493,7 @@ struct fpga_manager *of_fpga_mgr_get(struct device_node *node) { struct device *dev; - dev = class_find_device(fpga_mgr_class, NULL, node, - fpga_mgr_of_node_match); + dev = class_find_device_by_of_node(fpga_mgr_class, node); if (!dev) return ERR_PTR(-ENODEV); diff --git a/drivers/gpu/drm/drm_mipi_dsi.c b/drivers/gpu/drm/drm_mipi_dsi.c index ad19df0686c9..bd2498bbd74a 100644 --- a/drivers/gpu/drm/drm_mipi_dsi.c +++ b/drivers/gpu/drm/drm_mipi_dsi.c @@ -93,11 +93,6 @@ static struct bus_type mipi_dsi_bus_type = { .pm = &mipi_dsi_device_pm_ops, }; -static int of_device_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - /** * of_find_mipi_dsi_device_by_node() - find the MIPI DSI device matching a * device tree node @@ -110,7 +105,7 @@ struct mipi_dsi_device *of_find_mipi_dsi_device_by_node(struct device_node *np) { struct device *dev; - dev = bus_find_device(&mipi_dsi_bus_type, NULL, np, of_device_match); + dev = bus_find_device_by_of_node(&mipi_dsi_bus_type, np); return dev ? to_mipi_dsi_device(dev) : NULL; } diff --git a/drivers/i2c/i2c-core-of.c b/drivers/i2c/i2c-core-of.c index d1c48dec7118..6f632d543fcc 100644 --- a/drivers/i2c/i2c-core-of.c +++ b/drivers/i2c/i2c-core-of.c @@ -113,11 +113,6 @@ void of_i2c_register_devices(struct i2c_adapter *adap) of_node_put(bus); } -static int of_dev_node_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - static int of_dev_or_parent_node_match(struct device *dev, const void *data) { if (dev->of_node == data) @@ -135,7 +130,7 @@ struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) struct device *dev; struct i2c_client *client; - dev = bus_find_device(&i2c_bus_type, NULL, node, of_dev_node_match); + dev = bus_find_device_by_of_node(&i2c_bus_type, node); if (!dev) return NULL; diff --git a/drivers/mfd/altera-sysmgr.c b/drivers/mfd/altera-sysmgr.c index 2ee14d8a6d31..d2a13a547a3c 100644 --- a/drivers/mfd/altera-sysmgr.c +++ b/drivers/mfd/altera-sysmgr.c @@ -87,16 +87,6 @@ static struct regmap_config altr_sysmgr_regmap_cfg = { .use_single_write = true, }; -/** - * sysmgr_match_phandle - * Matching function used by driver_find_device(). - * Return: True if match is found, otherwise false. - */ -static int sysmgr_match_phandle(struct device *dev, const void *data) -{ - return dev->of_node == (const struct device_node *)data; -} - /** * altr_sysmgr_regmap_lookup_by_phandle * Find the sysmgr previous configured in probe() and return regmap property. @@ -117,8 +107,8 @@ struct regmap *altr_sysmgr_regmap_lookup_by_phandle(struct device_node *np, if (!sysmgr_np) return ERR_PTR(-ENODEV); - dev = driver_find_device(&altr_sysmgr_driver.driver, NULL, - (void *)sysmgr_np, sysmgr_match_phandle); + dev = driver_find_device_by_of_node(&altr_sysmgr_driver.driver, + (void *)sysmgr_np); of_node_put(sysmgr_np); if (!dev) return ERR_PTR(-EPROBE_DEFER); diff --git a/drivers/mux/core.c b/drivers/mux/core.c index d1271c1ee23c..1fb22388e7e0 100644 --- a/drivers/mux/core.c +++ b/drivers/mux/core.c @@ -405,17 +405,12 @@ int mux_control_deselect(struct mux_control *mux) } EXPORT_SYMBOL_GPL(mux_control_deselect); -static int of_dev_node_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - /* Note this function returns a reference to the mux_chip dev. */ static struct mux_chip *of_find_mux_chip_by_node(struct device_node *np) { struct device *dev; - dev = class_find_device(&mux_class, NULL, np, of_dev_node_match); + dev = class_find_device_by_of_node(&mux_class, np); return dev ? to_mux_chip(dev) : NULL; } diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index bd04fe762056..ce940871331e 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -262,11 +262,6 @@ static struct class mdio_bus_class = { }; #if IS_ENABLED(CONFIG_OF_MDIO) -/* Helper function for of_mdio_find_bus */ -static int of_mdio_bus_match(struct device *dev, const void *mdio_bus_np) -{ - return dev->of_node == mdio_bus_np; -} /** * of_mdio_find_bus - Given an mii_bus node, find the mii_bus. * @mdio_bus_np: Pointer to the mii_bus. @@ -287,9 +282,7 @@ struct mii_bus *of_mdio_find_bus(struct device_node *mdio_bus_np) if (!mdio_bus_np) return NULL; - d = class_find_device(&mdio_bus_class, NULL, mdio_bus_np, - of_mdio_bus_match); - + d = class_find_device_by_of_node(&mdio_bus_class, mdio_bus_np); return d ? to_mii_bus(d) : NULL; } EXPORT_SYMBOL(of_mdio_find_bus); diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index ac5d945be88a..057d1ff87d5d 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -76,11 +76,6 @@ static struct bus_type nvmem_bus_type = { .name = "nvmem", }; -static int of_nvmem_match(struct device *dev, const void *nvmem_np) -{ - return dev->of_node == nvmem_np; -} - static struct nvmem_device *of_nvmem_find(struct device_node *nvmem_np) { struct device *d; @@ -88,7 +83,7 @@ static struct nvmem_device *of_nvmem_find(struct device_node *nvmem_np) if (!nvmem_np) return NULL; - d = bus_find_device(&nvmem_bus_type, NULL, nvmem_np, of_nvmem_match); + d = bus_find_device_by_of_node(&nvmem_bus_type, nvmem_np); if (!d) return NULL; diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 44f53496cab1..000b95787df1 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -280,12 +280,6 @@ unregister: } EXPORT_SYMBOL(of_mdiobus_register); -/* Helper function for of_phy_find_device */ -static int of_phy_match(struct device *dev, const void *phy_np) -{ - return dev->of_node == phy_np; -} - /** * of_phy_find_device - Give a PHY node, find the phy_device * @phy_np: Pointer to the phy's device tree node @@ -301,7 +295,7 @@ struct phy_device *of_phy_find_device(struct device_node *phy_np) if (!phy_np) return NULL; - d = bus_find_device(&mdio_bus_type, NULL, phy_np, of_phy_match); + d = bus_find_device_by_of_node(&mdio_bus_type, phy_np); if (d) { mdiodev = to_mdio_device(d); if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY) diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 7801e25e6895..b47a2292fe8e 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -37,11 +37,6 @@ static const struct of_device_id of_skipped_node_table[] = { {} /* Empty terminated list */ }; -static int of_dev_node_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - /** * of_find_device_by_node - Find the platform_device associated with a node * @np: Pointer to device tree node @@ -55,7 +50,7 @@ struct platform_device *of_find_device_by_node(struct device_node *np) { struct device *dev; - dev = bus_find_device(&platform_bus_type, NULL, np, of_dev_node_match); + dev = bus_find_device_by_of_node(&platform_bus_type, np); return dev ? to_platform_device(dev) : NULL; } EXPORT_SYMBOL(of_find_device_by_node); diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 397918ebba55..20dcc9c03adc 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -460,16 +460,11 @@ error: return NULL; } -static int of_node_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - struct regulator_dev *of_find_regulator_by_node(struct device_node *np) { struct device *dev; - dev = class_find_device(®ulator_class, NULL, np, of_node_match); + dev = class_find_device_by_of_node(®ulator_class, np); return dev ? dev_to_rdev(dev) : NULL; } diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 75ac046cae52..a591da87981a 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -3652,37 +3652,25 @@ EXPORT_SYMBOL_GPL(spi_write_then_read); /*-------------------------------------------------------------------------*/ #if IS_ENABLED(CONFIG_OF) -static int __spi_of_device_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - /* must call put_device() when done with returned spi_device device */ struct spi_device *of_find_spi_device_by_node(struct device_node *node) { - struct device *dev = bus_find_device(&spi_bus_type, NULL, node, - __spi_of_device_match); + struct device *dev = bus_find_device_by_of_node(&spi_bus_type, node); + return dev ? to_spi_device(dev) : NULL; } EXPORT_SYMBOL_GPL(of_find_spi_device_by_node); #endif /* IS_ENABLED(CONFIG_OF) */ #if IS_ENABLED(CONFIG_OF_DYNAMIC) -static int __spi_of_controller_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - /* the spi controllers are not using spi_bus, so we find it with another way */ static struct spi_controller *of_find_spi_controller_by_node(struct device_node *node) { struct device *dev; - dev = class_find_device(&spi_master_class, NULL, node, - __spi_of_controller_match); + dev = class_find_device_by_of_node(&spi_master_class, node); if (!dev && IS_ENABLED(CONFIG_SPI_SLAVE)) - dev = class_find_device(&spi_slave_class, NULL, node, - __spi_of_controller_match); + dev = class_find_device_by_of_node(&spi_slave_class, node); if (!dev) return NULL; diff --git a/include/linux/device.h b/include/linux/device.h index 3ba376b8b456..29d8d7ad41e6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -186,6 +186,18 @@ static inline struct device *bus_find_device_by_name(struct bus_type *bus, return bus_find_device(bus, start, name, device_match_name); } +/** + * bus_find_device_by_of_node : device iterator for locating a particular device + * matching the of_node. + * @bus: bus type + * @np: of_node of the device to match. + */ +static inline struct device * +bus_find_device_by_of_node(struct bus_type *bus, const struct device_node *np) +{ + return bus_find_device(bus, NULL, np, device_match_of_node); +} + struct device *subsys_find_device_by_id(struct bus_type *bus, unsigned int id, struct device *hint); int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, @@ -366,6 +378,19 @@ static inline struct device *driver_find_device_by_name(struct device_driver *dr return driver_find_device(drv, NULL, name, device_match_name); } +/** + * driver_find_device_by_of_node- device iterator for locating a particular device + * by of_node pointer. + * @driver: the driver we're iterating + * @np: of_node pointer to match. + */ +static inline struct device * +driver_find_device_by_of_node(struct device_driver *drv, + const struct device_node *np) +{ + return driver_find_device(drv, NULL, np, device_match_of_node); +} + void driver_deferred_probe_add(struct device *dev); int driver_deferred_probe_check_state(struct device *dev); int driver_deferred_probe_check_state_continue(struct device *dev); @@ -507,6 +532,18 @@ static inline struct device *class_find_device_by_name(struct class *class, return class_find_device(class, NULL, name, device_match_name); } +/** + * class_find_device_by_of_node : device iterator for locating a particular device + * matching the of_node. + * @class: class type + * @np: of_node of the device to match. + */ +static inline struct device * +class_find_device_by_of_node(struct class *class, const struct device_node *np) +{ + return class_find_device(class, NULL, np, device_match_of_node); +} + struct class_attribute { struct attribute attr; ssize_t (*show)(struct class *class, struct class_attribute *attr, diff --git a/sound/soc/rockchip/rk3399_gru_sound.c b/sound/soc/rockchip/rk3399_gru_sound.c index c16b0ffe8cfc..d951100bf770 100644 --- a/sound/soc/rockchip/rk3399_gru_sound.c +++ b/sound/soc/rockchip/rk3399_gru_sound.c @@ -422,11 +422,6 @@ static const struct dailink_match_data dailink_match[] = { }, }; -static int of_dev_node_match(struct device *dev, const void *data) -{ - return dev->of_node == data; -} - static int rockchip_sound_codec_node_match(struct device_node *np_codec) { struct device *dev; @@ -438,8 +433,8 @@ static int rockchip_sound_codec_node_match(struct device_node *np_codec) continue; if (dailink_match[i].bus_type) { - dev = bus_find_device(dailink_match[i].bus_type, NULL, - np_codec, of_dev_node_match); + dev = bus_find_device_by_of_node(dailink_match[i].bus_type, + np_codec); if (!dev) continue; put_device(dev); -- cgit v1.2.3 From 67843bbaf36eb087714f40e783ee78e99e9e4b86 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Jul 2019 23:18:34 +0100 Subject: drivers: Introduce device lookup variants by fwnode Add a helper to match the firmware node handle of a device and provide wrappers for {bus/class/driver}_find_device() APIs to avoid proliferation of duplicate custom match functions. Cc: "David S. Miller" Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-usb@vger.kernel.org Cc: "Rafael J. Wysocki" Cc: Ulf Hansson Cc: Joe Perches Cc: Will Deacon Cc: Joerg Roedel Signed-off-by: Suzuki K Poulose Acked-by: Robin Murphy Reviewed-by: Mathieu Poirier Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20190723221838.12024-4-suzuki.poulose@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 6 ++++ drivers/base/devcon.c | 8 +---- drivers/hwtracing/coresight/coresight-platform.c | 11 ++---- drivers/hwtracing/coresight/coresight-priv.h | 2 -- drivers/hwtracing/coresight/coresight.c | 4 +-- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 8 +---- drivers/iommu/arm-smmu-v3.c | 9 ++--- drivers/iommu/arm-smmu.c | 9 ++--- drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 8 +---- drivers/usb/roles/class.c | 8 +---- drivers/usb/typec/class.c | 8 +---- include/linux/device.h | 39 ++++++++++++++++++++++ 12 files changed, 57 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index fb83647d685a..e8f81a667545 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3368,3 +3368,9 @@ int device_match_of_node(struct device *dev, const void *np) return dev->of_node == np; } EXPORT_SYMBOL_GPL(device_match_of_node); + +int device_match_fwnode(struct device *dev, const void *fwnode) +{ + return dev_fwnode(dev) == fwnode; +} +EXPORT_SYMBOL_GPL(device_match_fwnode); diff --git a/drivers/base/devcon.c b/drivers/base/devcon.c index 09f28479b243..1d488dc5dd0c 100644 --- a/drivers/base/devcon.c +++ b/drivers/base/devcon.c @@ -133,19 +133,13 @@ static struct bus_type *generic_match_buses[] = { NULL, }; -static int device_fwnode_match(struct device *dev, const void *fwnode) -{ - return dev_fwnode(dev) == fwnode; -} - static void *device_connection_fwnode_match(struct device_connection *con) { struct bus_type *bus; struct device *dev; for (bus = generic_match_buses[0]; bus; bus++) { - dev = bus_find_device(bus, NULL, (void *)con->fwnode, - device_fwnode_match); + dev = bus_find_device_by_fwnode(bus, con->fwnode); if (dev && !strncmp(dev_name(dev), con->id, strlen(con->id))) return dev; diff --git a/drivers/hwtracing/coresight/coresight-platform.c b/drivers/hwtracing/coresight/coresight-platform.c index dad7d96c5943..3c5bee429105 100644 --- a/drivers/hwtracing/coresight/coresight-platform.c +++ b/drivers/hwtracing/coresight/coresight-platform.c @@ -37,11 +37,6 @@ static int coresight_alloc_conns(struct device *dev, return 0; } -int coresight_device_fwnode_match(struct device *dev, const void *fwnode) -{ - return dev_fwnode(dev) == fwnode; -} - static struct device * coresight_find_device_by_fwnode(struct fwnode_handle *fwnode) { @@ -51,8 +46,7 @@ coresight_find_device_by_fwnode(struct fwnode_handle *fwnode) * If we have a non-configurable replicator, it will be found on the * platform bus. */ - dev = bus_find_device(&platform_bus_type, NULL, - fwnode, coresight_device_fwnode_match); + dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); if (dev) return dev; @@ -60,8 +54,7 @@ coresight_find_device_by_fwnode(struct fwnode_handle *fwnode) * We have a configurable component - circle through the AMBA bus * looking for the device that matches the endpoint node. */ - return bus_find_device(&amba_bustype, NULL, - fwnode, coresight_device_fwnode_match); + return bus_find_device_by_fwnode(&amba_bustype, fwnode); } #ifdef CONFIG_OF diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index 7d401790dd7e..61d7f9ff054d 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -202,6 +202,4 @@ static inline void *coresight_get_uci_data(const struct amba_id *id) void coresight_release_platform_data(struct coresight_platform_data *pdata); -int coresight_device_fwnode_match(struct device *dev, const void *fwnode); - #endif diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c index 55db77f6410b..6453c67a4d01 100644 --- a/drivers/hwtracing/coresight/coresight.c +++ b/drivers/hwtracing/coresight/coresight.c @@ -1046,9 +1046,7 @@ static void coresight_fixup_device_conns(struct coresight_device *csdev) struct coresight_connection *conn = &csdev->pdata->conns[i]; struct device *dev = NULL; - dev = bus_find_device(&coresight_bustype, NULL, - (void *)conn->child_fwnode, - coresight_device_fwnode_match); + dev = bus_find_device_by_fwnode(&coresight_bustype, conn->child_fwnode); if (dev) { conn->child_dev = to_coresight_device(dev); /* and put reference from 'bus_find_device()' */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 81e6dedb1e02..fa05e943038a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -4499,19 +4499,13 @@ static const struct acpi_device_id hns_roce_acpi_match[] = { }; MODULE_DEVICE_TABLE(acpi, hns_roce_acpi_match); -static int hns_roce_node_match(struct device *dev, const void *fwnode) -{ - return dev->fwnode == fwnode; -} - static struct platform_device *hns_roce_find_pdev(struct fwnode_handle *fwnode) { struct device *dev; /* get the 'device' corresponding to the matching 'fwnode' */ - dev = bus_find_device(&platform_bus_type, NULL, - fwnode, hns_roce_node_match); + dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); /* get the platform device */ return dev ? to_platform_device(dev) : NULL; } diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index a9a9fabd3968..6f0e13fa5e1a 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2034,16 +2034,11 @@ arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) static struct platform_driver arm_smmu_driver; -static int arm_smmu_match_node(struct device *dev, const void *data) -{ - return dev->fwnode == data; -} - static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device(&arm_smmu_driver.driver, NULL, - fwnode, arm_smmu_match_node); + struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, + fwnode); put_device(dev); return dev ? dev_get_drvdata(dev) : NULL; } diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 64977c131ee6..aa06498f291d 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1426,16 +1426,11 @@ static bool arm_smmu_capable(enum iommu_cap cap) } } -static int arm_smmu_match_node(struct device *dev, const void *data) -{ - return dev->fwnode == data; -} - static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device(&arm_smmu_driver.driver, NULL, - fwnode, arm_smmu_match_node); + struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, + fwnode); put_device(dev); return dev ? dev_get_drvdata(dev) : NULL; } diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c index bb6586d0e5af..ed3829ae4ef1 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c @@ -754,17 +754,11 @@ struct dsaf_misc_op *hns_misc_op_get(struct dsaf_device *dsaf_dev) return (void *)misc_op; } -static int hns_dsaf_dev_match(struct device *dev, const void *fwnode) -{ - return dev->fwnode == fwnode; -} - struct platform_device *hns_dsaf_find_platform_device(struct fwnode_handle *fwnode) { struct device *dev; - dev = bus_find_device(&platform_bus_type, NULL, - fwnode, hns_dsaf_dev_match); + dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); return dev ? to_platform_device(dev) : NULL; } diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c index c8efe60e2465..0526efbc4922 100644 --- a/drivers/usb/roles/class.c +++ b/drivers/usb/roles/class.c @@ -85,11 +85,6 @@ enum usb_role usb_role_switch_get_role(struct usb_role_switch *sw) } EXPORT_SYMBOL_GPL(usb_role_switch_get_role); -static int switch_fwnode_match(struct device *dev, const void *fwnode) -{ - return dev_fwnode(dev) == fwnode; -} - static void *usb_role_switch_match(struct device_connection *con, int ep, void *data) { @@ -99,8 +94,7 @@ static void *usb_role_switch_match(struct device_connection *con, int ep, if (con->id && !fwnode_property_present(con->fwnode, con->id)) return NULL; - dev = class_find_device(role_class, NULL, con->fwnode, - switch_fwnode_match); + dev = class_find_device_by_fwnode(role_class, con->fwnode); } else { dev = class_find_device_by_name(role_class, con->endpoint[ep]); } diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index 9b0d15b487e5..94a3eda62add 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -205,11 +205,6 @@ static void typec_altmode_put_partner(struct altmode *altmode) put_device(&adev->dev); } -static int typec_port_fwnode_match(struct device *dev, const void *fwnode) -{ - return dev_fwnode(dev) == fwnode; -} - static void *typec_port_match(struct device_connection *con, int ep, void *data) { struct device *dev; @@ -219,8 +214,7 @@ static void *typec_port_match(struct device_connection *con, int ep, void *data) * we need to return ERR_PTR(-PROBE_DEFER) when there is no device. */ if (con->fwnode) - return class_find_device(typec_class, NULL, con->fwnode, - typec_port_fwnode_match); + return class_find_device_by_fwnode(typec_class, con->fwnode); dev = class_find_device_by_name(typec_class, con->endpoint[ep]); diff --git a/include/linux/device.h b/include/linux/device.h index 29d8d7ad41e6..7133fc1c285d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -166,6 +166,7 @@ void subsys_dev_iter_exit(struct subsys_dev_iter *iter); int device_match_name(struct device *dev, const void *name); int device_match_of_node(struct device *dev, const void *np); +int device_match_fwnode(struct device *dev, const void *fwnode); int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *dev, void *data)); @@ -198,6 +199,18 @@ bus_find_device_by_of_node(struct bus_type *bus, const struct device_node *np) return bus_find_device(bus, NULL, np, device_match_of_node); } +/** + * bus_find_device_by_fwnode : device iterator for locating a particular device + * matching the fwnode. + * @bus: bus type + * @fwnode: fwnode of the device to match. + */ +static inline struct device * +bus_find_device_by_fwnode(struct bus_type *bus, const struct fwnode_handle *fwnode) +{ + return bus_find_device(bus, NULL, fwnode, device_match_fwnode); +} + struct device *subsys_find_device_by_id(struct bus_type *bus, unsigned int id, struct device *hint); int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, @@ -391,6 +404,19 @@ driver_find_device_by_of_node(struct device_driver *drv, return driver_find_device(drv, NULL, np, device_match_of_node); } +/** + * driver_find_device_by_fwnode- device iterator for locating a particular device + * by fwnode pointer. + * @driver: the driver we're iterating + * @fwnode: fwnode pointer to match. + */ +static inline struct device * +driver_find_device_by_fwnode(struct device_driver *drv, + const struct fwnode_handle *fwnode) +{ + return driver_find_device(drv, NULL, fwnode, device_match_fwnode); +} + void driver_deferred_probe_add(struct device *dev); int driver_deferred_probe_check_state(struct device *dev); int driver_deferred_probe_check_state_continue(struct device *dev); @@ -544,6 +570,19 @@ class_find_device_by_of_node(struct class *class, const struct device_node *np) return class_find_device(class, NULL, np, device_match_of_node); } +/** + * class_find_device_by_fwnode : device iterator for locating a particular device + * matching the fwnode. + * @class: class type + * @fwnode: fwnode of the device to match. + */ +static inline struct device * +class_find_device_by_fwnode(struct class *class, + const struct fwnode_handle *fwnode) +{ + return class_find_device(class, NULL, fwnode, device_match_fwnode); +} + struct class_attribute { struct attribute attr; ssize_t (*show)(struct class *class, struct class_attribute *attr, -- cgit v1.2.3 From 4495dfdd6193d9712b7b8f5d699d89d5996e6aaa Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Jul 2019 23:18:35 +0100 Subject: drivers: Introduce device lookup variants by device type Add a helper to match a device by its type and provide wrappers for {bus/class/driver}_find_device() APIs. Cc: Alexander Shishkin Cc: Arnd Bergmann Cc: Harald Freudenberger Cc: Heiko Carstens Cc: linux-usb@vger.kernel.org Cc: Oliver Neukum Cc: Sebastian Andrzej Siewior Cc: Tomas Winkler Cc: "Rafael J. Wysocki" Cc: Greg Kroah-Hartman Cc: Ulf Hansson Cc: Joe Perches Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20190723221838.12024-5-suzuki.poulose@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 15 +++++++-------- drivers/hwtracing/intel_th/core.c | 10 +--------- drivers/misc/mei/main.c | 9 +-------- drivers/s390/crypto/zcrypt_api.c | 11 +---------- drivers/tty/tty_io.c | 8 +------- drivers/usb/core/devio.c | 8 +------- include/linux/device.h | 37 +++++++++++++++++++++++++++++++++++++ 7 files changed, 49 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index e8f81a667545..3abc32b60c0a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2867,13 +2867,6 @@ struct device *device_create_with_groups(struct class *class, } EXPORT_SYMBOL_GPL(device_create_with_groups); -static int __match_devt(struct device *dev, const void *data) -{ - const dev_t *devt = data; - - return dev->devt == *devt; -} - /** * device_destroy - removes a device that was created with device_create() * @class: pointer to the struct class that this device was registered with @@ -2886,7 +2879,7 @@ void device_destroy(struct class *class, dev_t devt) { struct device *dev; - dev = class_find_device(class, NULL, &devt, __match_devt); + dev = class_find_device_by_devt(class, devt); if (dev) { put_device(dev); device_unregister(dev); @@ -3374,3 +3367,9 @@ int device_match_fwnode(struct device *dev, const void *fwnode) return dev_fwnode(dev) == fwnode; } EXPORT_SYMBOL_GPL(device_match_fwnode); + +int device_match_devt(struct device *dev, const void *pdevt) +{ + return dev->devt == *(dev_t *)pdevt; +} +EXPORT_SYMBOL_GPL(device_match_devt); diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c index 55922896d862..d5c1821b31c6 100644 --- a/drivers/hwtracing/intel_th/core.c +++ b/drivers/hwtracing/intel_th/core.c @@ -789,12 +789,6 @@ static int intel_th_populate(struct intel_th *th) return 0; } -static int match_devt(struct device *dev, const void *data) -{ - dev_t devt = (dev_t)(unsigned long)(void *)data; - return dev->devt == devt; -} - static int intel_th_output_open(struct inode *inode, struct file *file) { const struct file_operations *fops; @@ -802,9 +796,7 @@ static int intel_th_output_open(struct inode *inode, struct file *file) struct device *dev; int err; - dev = bus_find_device(&intel_th_bus, NULL, - (void *)(unsigned long)inode->i_rdev, - match_devt); + dev = bus_find_device_by_devt(&intel_th_bus, inode->i_rdev); if (!dev || !dev->driver) return -ENODEV; diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index f894d1f8a53e..7310b476323c 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -858,13 +858,6 @@ static ssize_t dev_state_show(struct device *device, } static DEVICE_ATTR_RO(dev_state); -static int match_devt(struct device *dev, const void *data) -{ - const dev_t *devt = data; - - return dev->devt == *devt; -} - /** * dev_set_devstate: set to new device state and notify sysfs file. * @@ -880,7 +873,7 @@ void mei_set_devstate(struct mei_device *dev, enum mei_dev_state state) dev->dev_state = state; - clsdev = class_find_device(mei_class, NULL, &dev->cdev.dev, match_devt); + clsdev = class_find_device_by_devt(mei_class, dev->cdev.dev); if (clsdev) { sysfs_notify(&clsdev->kobj, NULL, "dev_state"); put_device(clsdev); diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 38a5a47b8c9c..150f6236c9bb 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -133,12 +133,6 @@ struct zcdn_device { static int zcdn_create(const char *name); static int zcdn_destroy(const char *name); -/* helper function, matches the devt value for find_zcdndev_by_devt() */ -static int __match_zcdn_devt(struct device *dev, const void *data) -{ - return dev->devt == *((dev_t *) data); -} - /* * Find zcdn device by name. * Returns reference to the zcdn device which needs to be released @@ -158,10 +152,7 @@ static inline struct zcdn_device *find_zcdndev_by_name(const char *name) */ static inline struct zcdn_device *find_zcdndev_by_devt(dev_t devt) { - struct device *dev = - class_find_device(zcrypt_class, NULL, - (void *) &devt, - __match_zcdn_devt); + struct device *dev = class_find_device_by_devt(zcrypt_class, devt); return dev ? to_zcdn_dev(dev) : NULL; } diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 566728fbaf3c..802c1210558f 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2952,17 +2952,11 @@ void do_SAK(struct tty_struct *tty) EXPORT_SYMBOL(do_SAK); -static int dev_match_devt(struct device *dev, const void *data) -{ - const dev_t *devt = data; - return dev->devt == *devt; -} - /* Must put_device() after it's unused! */ static struct device *tty_get_device(struct tty_struct *tty) { dev_t devt = tty_devnum(tty); - return class_find_device(tty_class, NULL, &devt, dev_match_devt); + return class_find_device_by_devt(tty_class, devt); } diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index b265ab5405f9..60268aee93a8 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -942,17 +942,11 @@ error: return ret; } -static int match_devt(struct device *dev, const void *data) -{ - return dev->devt == (dev_t)(unsigned long)(void *)data; -} - static struct usb_device *usbdev_lookup_by_devt(dev_t devt) { struct device *dev; - dev = bus_find_device(&usb_bus_type, NULL, - (void *) (unsigned long) devt, match_devt); + dev = bus_find_device_by_devt(&usb_bus_type, devt); if (!dev) return NULL; return to_usb_device(dev); diff --git a/include/linux/device.h b/include/linux/device.h index 7133fc1c285d..93b2f55ef44e 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -167,6 +167,7 @@ void subsys_dev_iter_exit(struct subsys_dev_iter *iter); int device_match_name(struct device *dev, const void *name); int device_match_of_node(struct device *dev, const void *np); int device_match_fwnode(struct device *dev, const void *fwnode); +int device_match_devt(struct device *dev, const void *pdevt); int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *dev, void *data)); @@ -211,6 +212,18 @@ bus_find_device_by_fwnode(struct bus_type *bus, const struct fwnode_handle *fwno return bus_find_device(bus, NULL, fwnode, device_match_fwnode); } +/** + * bus_find_device_by_devt : device iterator for locating a particular device + * matching the device type. + * @bus: bus type + * @devt: device type of the device to match. + */ +static inline struct device *bus_find_device_by_devt(struct bus_type *bus, + dev_t devt) +{ + return bus_find_device(bus, NULL, &devt, device_match_devt); +} + struct device *subsys_find_device_by_id(struct bus_type *bus, unsigned int id, struct device *hint); int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, @@ -417,6 +430,18 @@ driver_find_device_by_fwnode(struct device_driver *drv, return driver_find_device(drv, NULL, fwnode, device_match_fwnode); } +/** + * driver_find_device_by_devt- device iterator for locating a particular device + * by devt. + * @driver: the driver we're iterating + * @devt: devt pointer to match. + */ +static inline struct device *driver_find_device_by_devt(struct device_driver *drv, + dev_t devt) +{ + return driver_find_device(drv, NULL, &devt, device_match_devt); +} + void driver_deferred_probe_add(struct device *dev); int driver_deferred_probe_check_state(struct device *dev); int driver_deferred_probe_check_state_continue(struct device *dev); @@ -583,6 +608,18 @@ class_find_device_by_fwnode(struct class *class, return class_find_device(class, NULL, fwnode, device_match_fwnode); } +/** + * class_find_device_by_devt : device iterator for locating a particular device + * matching the device type. + * @class: class type + * @devt: device type of the device to match. + */ +static inline struct device *class_find_device_by_devt(struct class *class, + dev_t devt) +{ + return class_find_device(class, NULL, &devt, device_match_devt); +} + struct class_attribute { struct attribute attr; ssize_t (*show)(struct class *class, struct class_attribute *attr, -- cgit v1.2.3 From 00500147cbd3fc51353d0d003eaa9d31c72c0d50 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Jul 2019 23:18:36 +0100 Subject: drivers: Introduce device lookup variants by ACPI_COMPANION device Add a generic helper to match a device by the ACPI_COMPANION device and provide wrappers for the device lookup APIs. Cc: Len Brown Cc: linux-acpi@vger.kernel.org Cc: linux-spi@vger.kernel.org Cc: Mika Westerberg Cc: linux-i2c@vger.kernel.org Cc: "Rafael J. Wysocki" Signed-off-by: Suzuki K Poulose Acked-by: Mark Brown Acked-by: Wolfram Sang # I2C parts Link: https://lore.kernel.org/r/20190723221838.12024-6-suzuki.poulose@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 6 +++++ drivers/i2c/i2c-core-acpi.c | 11 ++------ drivers/spi/spi.c | 8 +----- include/linux/device.h | 65 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index 3abc32b60c0a..57d71bc2c559 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3373,3 +3373,9 @@ int device_match_devt(struct device *dev, const void *pdevt) return dev->devt == *(dev_t *)pdevt; } EXPORT_SYMBOL_GPL(device_match_devt); + +int device_match_acpi_dev(struct device *dev, const void *adev) +{ + return ACPI_COMPANION(dev) == adev; +} +EXPORT_SYMBOL(device_match_acpi_dev); diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index 4dbbc9a35f65..bc80aafb521f 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -354,17 +354,11 @@ static int i2c_acpi_find_match_adapter(struct device *dev, const void *data) return ACPI_HANDLE(dev) == (acpi_handle)data; } -static int i2c_acpi_find_match_device(struct device *dev, const void *data) -{ - return ACPI_COMPANION(dev) == data; -} struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) { - struct device *dev; + struct device *dev = bus_find_device_by_acpi_dev(&i2c_bus_type, handle); - dev = bus_find_device(&i2c_bus_type, NULL, handle, - i2c_acpi_find_match_adapter); return dev ? i2c_verify_adapter(dev) : NULL; } EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle); @@ -373,8 +367,7 @@ static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev) { struct device *dev; - dev = bus_find_device(&i2c_bus_type, NULL, adev, - i2c_acpi_find_match_device); + dev = bus_find_device_by_acpi_dev(&i2c_bus_type, adev); return dev ? i2c_verify_client(dev) : NULL; } diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index a591da87981a..c486a6f84c2c 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -3741,11 +3741,6 @@ static int spi_acpi_controller_match(struct device *dev, const void *data) return ACPI_COMPANION(dev->parent) == data; } -static int spi_acpi_device_match(struct device *dev, const void *data) -{ - return ACPI_COMPANION(dev) == data; -} - static struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev) { struct device *dev; @@ -3765,8 +3760,7 @@ static struct spi_device *acpi_spi_find_device_by_adev(struct acpi_device *adev) { struct device *dev; - dev = bus_find_device(&spi_bus_type, NULL, adev, spi_acpi_device_match); - + dev = bus_find_device_by_acpi_dev(&spi_bus_type, adev); return dev ? to_spi_device(dev) : NULL; } diff --git a/include/linux/device.h b/include/linux/device.h index 93b2f55ef44e..7514ef3d3f1a 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -168,6 +168,7 @@ int device_match_name(struct device *dev, const void *name); int device_match_of_node(struct device *dev, const void *np); int device_match_fwnode(struct device *dev, const void *fwnode); int device_match_devt(struct device *dev, const void *pdevt); +int device_match_acpi_dev(struct device *dev, const void *adev); int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *dev, void *data)); @@ -224,6 +225,28 @@ static inline struct device *bus_find_device_by_devt(struct bus_type *bus, return bus_find_device(bus, NULL, &devt, device_match_devt); } +#ifdef CONFIG_ACPI +struct acpi_device; + +/** + * bus_find_device_by_acpi_dev : device iterator for locating a particular device + * matching the ACPI COMPANION device. + * @bus: bus type + * @adev: ACPI COMPANION device to match. + */ +static inline struct device * +bus_find_device_by_acpi_dev(struct bus_type *bus, const struct acpi_device *adev) +{ + return bus_find_device(bus, NULL, adev, device_match_acpi_dev); +} +#else +static inline struct device * +bus_find_device_by_acpi_dev(struct bus_type *bus, const void *adev) +{ + return NULL; +} +#endif + struct device *subsys_find_device_by_id(struct bus_type *bus, unsigned int id, struct device *hint); int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, @@ -442,6 +465,27 @@ static inline struct device *driver_find_device_by_devt(struct device_driver *dr return driver_find_device(drv, NULL, &devt, device_match_devt); } +#ifdef CONFIG_ACPI +/** + * driver_find_device_by_acpi_dev : device iterator for locating a particular + * device matching the ACPI_COMPANION device. + * @driver: the driver we're iterating + * @adev: ACPI_COMPANION device to match. + */ +static inline struct device * +driver_find_device_by_acpi_dev(struct device_driver *drv, + const struct acpi_device *adev) +{ + return driver_find_device(drv, NULL, adev, device_match_acpi_dev); +} +#else +static inline struct device * +driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev) +{ + return NULL; +} +#endif + void driver_deferred_probe_add(struct device *dev); int driver_deferred_probe_check_state(struct device *dev); int driver_deferred_probe_check_state_continue(struct device *dev); @@ -620,6 +664,27 @@ static inline struct device *class_find_device_by_devt(struct class *class, return class_find_device(class, NULL, &devt, device_match_devt); } +#ifdef CONFIG_ACPI +struct acpi_device; +/** + * class_find_device_by_acpi_dev : device iterator for locating a particular + * device matching the ACPI_COMPANION device. + * @class: class type + * @adev: ACPI_COMPANION device to match. + */ +static inline struct device * +class_find_device_by_acpi_dev(struct class *class, const struct acpi_device *adev) +{ + return class_find_device(class, NULL, adev, device_match_acpi_dev); +} +#else +static inline struct device * +class_find_device_by_acpi_dev(struct class *class, const void *adev) +{ + return NULL; +} +#endif + struct class_attribute { struct attribute attr; ssize_t (*show)(struct class *class, struct class_attribute *attr, -- cgit v1.2.3 From 6bf85ba9e55f659ddc0747bf1bb504ec6d15f525 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Jul 2019 23:18:37 +0100 Subject: drivers: Add generic helper to match any device Add a generic helper to match any/all devices. Using this introduce new wrappers {bus/driver/class}_find_next_device(). Cc: Elie Morisse Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Nehal Shah Cc: "Rafael J. Wysocki" Cc: Shyam Sundar S K Signed-off-by: Suzuki K Poulose Acked-by: Bjorn Helgaas # PCI Link: https://lore.kernel.org/r/20190723221838.12024-7-suzuki.poulose@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 6 ++++++ drivers/i2c/busses/i2c-amd-mp2-pci.c | 8 +------- drivers/pci/probe.c | 7 +------ drivers/s390/cio/ccwgroup.c | 8 +------- drivers/scsi/scsi_proc.c | 9 ++------- include/linux/device.h | 17 +++++++++++++++++ 6 files changed, 28 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index 57d71bc2c559..e22e29b3dc97 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3379,3 +3379,9 @@ int device_match_acpi_dev(struct device *dev, const void *adev) return ACPI_COMPANION(dev) == adev; } EXPORT_SYMBOL(device_match_acpi_dev); + +int device_match_any(struct device *dev, const void *unused) +{ + return 1; +} +EXPORT_SYMBOL_GPL(device_match_any); diff --git a/drivers/i2c/busses/i2c-amd-mp2-pci.c b/drivers/i2c/busses/i2c-amd-mp2-pci.c index c7fe3b44a860..5e4800d72e00 100644 --- a/drivers/i2c/busses/i2c-amd-mp2-pci.c +++ b/drivers/i2c/busses/i2c-amd-mp2-pci.c @@ -457,18 +457,12 @@ static struct pci_driver amd_mp2_pci_driver = { }; module_pci_driver(amd_mp2_pci_driver); -static int amd_mp2_device_match(struct device *dev, const void *data) -{ - return 1; -} - struct amd_mp2_dev *amd_mp2_find_device(void) { struct device *dev; struct pci_dev *pci_dev; - dev = driver_find_device(&amd_mp2_pci_driver.driver, NULL, NULL, - amd_mp2_device_match); + dev = driver_find_next_device(&amd_mp2_pci_driver.driver, NULL); if (!dev) return NULL; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index a3c7338fad86..dbeeb385fb9f 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -64,11 +64,6 @@ static struct resource *get_pci_domain_busn_res(int domain_nr) return &r->res; } -static int find_anything(struct device *dev, const void *data) -{ - return 1; -} - /* * Some device drivers need know if PCI is initiated. * Basically, we think PCI is not initiated when there @@ -79,7 +74,7 @@ int no_pci_devices(void) struct device *dev; int no_devices; - dev = bus_find_device(&pci_bus_type, NULL, NULL, find_anything); + dev = bus_find_next_device(&pci_bus_type, NULL); no_devices = (dev == NULL); put_device(dev); return no_devices; diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index d843e362c167..0005ec9285aa 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -581,11 +581,6 @@ int ccwgroup_driver_register(struct ccwgroup_driver *cdriver) } EXPORT_SYMBOL(ccwgroup_driver_register); -static int __ccwgroup_match_all(struct device *dev, const void *data) -{ - return 1; -} - /** * ccwgroup_driver_unregister() - deregister a ccw group driver * @cdriver: driver to be deregistered @@ -597,8 +592,7 @@ void ccwgroup_driver_unregister(struct ccwgroup_driver *cdriver) struct device *dev; /* We don't want ccwgroup devices to live longer than their driver. */ - while ((dev = driver_find_device(&cdriver->driver, NULL, NULL, - __ccwgroup_match_all))) { + while ((dev = driver_find_next_device(&cdriver->driver, NULL))) { struct ccwgroup_device *gdev = to_ccwgroupdev(dev); ccwgroup_ungroup(gdev); diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c index c074631086a4..5b313226f11c 100644 --- a/drivers/scsi/scsi_proc.c +++ b/drivers/scsi/scsi_proc.c @@ -372,15 +372,10 @@ static ssize_t proc_scsi_write(struct file *file, const char __user *buf, return err; } -static int always_match(struct device *dev, const void *data) -{ - return 1; -} - static inline struct device *next_scsi_device(struct device *start) { - struct device *next = bus_find_device(&scsi_bus_type, start, NULL, - always_match); + struct device *next = bus_find_next_device(&scsi_bus_type, start); + put_device(start); return next; } diff --git a/include/linux/device.h b/include/linux/device.h index 7514ef3d3f1a..8ae3f4b47293 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -169,6 +169,7 @@ int device_match_of_node(struct device *dev, const void *np); int device_match_fwnode(struct device *dev, const void *fwnode); int device_match_devt(struct device *dev, const void *pdevt); int device_match_acpi_dev(struct device *dev, const void *adev); +int device_match_any(struct device *dev, const void *unused); int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *dev, void *data)); @@ -225,6 +226,16 @@ static inline struct device *bus_find_device_by_devt(struct bus_type *bus, return bus_find_device(bus, NULL, &devt, device_match_devt); } +/** + * bus_find_next_device - Find the next device after a given device in a + * given bus. + */ +static inline struct device * +bus_find_next_device(struct bus_type *bus,struct device *cur) +{ + return bus_find_device(bus, cur, NULL, device_match_any); +} + #ifdef CONFIG_ACPI struct acpi_device; @@ -465,6 +476,12 @@ static inline struct device *driver_find_device_by_devt(struct device_driver *dr return driver_find_device(drv, NULL, &devt, device_match_devt); } +static inline struct device *driver_find_next_device(struct device_driver *drv, + struct device *start) +{ + return driver_find_device(drv, start, NULL, device_match_any); +} + #ifdef CONFIG_ACPI /** * driver_find_device_by_acpi_dev : device iterator for locating a particular -- cgit v1.2.3 From 36f3313d6bff91ab2a9e47698c27d15363640a4e Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Jul 2019 23:18:38 +0100 Subject: platform: Add platform_find_device_by_driver() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide a helper to lookup platform devices by matching device driver in order to avoid drivers trying to use platform bus internals. Cc: Eric Anholt Cc: Greg Kroah-Hartman Cc: "Heiko Stübner" Cc: Inki Dae Cc: "Rafael J. Wysocki" Cc: Sandy Huang Cc: Seung-Woo Kim Tested-by: Heiko Stuebner Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20190723221838.12024-8-suzuki.poulose@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 14 ++++++++++++++ drivers/gpu/drm/exynos/exynos_drm_drv.c | 9 +++------ drivers/gpu/drm/mcde/mcde_drv.c | 3 +-- drivers/gpu/drm/rockchip/rockchip_drm_drv.c | 3 +-- drivers/gpu/drm/vc4/vc4_drv.c | 3 +-- include/linux/platform_device.h | 3 +++ 6 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 506a0175a5a7..a174ce5ea17c 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1197,6 +1197,20 @@ struct bus_type platform_bus_type = { }; EXPORT_SYMBOL_GPL(platform_bus_type); +/** + * platform_find_device_by_driver - Find a platform device with a given + * driver. + * @start: The device to start the search from. + * @drv: The device driver to look for. + */ +struct device *platform_find_device_by_driver(struct device *start, + const struct device_driver *drv) +{ + return bus_find_device(&platform_bus_type, start, drv, + (void *)platform_match); +} +EXPORT_SYMBOL_GPL(platform_find_device_by_driver); + int __init platform_bus_init(void) { int error; diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index 58baf49d9926..badab94be2d6 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -242,9 +242,7 @@ static struct component_match *exynos_drm_match_add(struct device *dev) if (!info->driver || !(info->flags & DRM_COMPONENT_DRIVER)) continue; - while ((d = bus_find_device(&platform_bus_type, p, - &info->driver->driver, - (void *)platform_bus_type.match))) { + while ((d = platform_find_device_by_driver(p, &info->driver->driver))) { put_device(p); if (!(info->flags & DRM_FIMC_DEVICE) || @@ -412,9 +410,8 @@ static void exynos_drm_unregister_devices(void) if (!info->driver || !(info->flags & DRM_VIRTUAL_DEVICE)) continue; - while ((dev = bus_find_device(&platform_bus_type, NULL, - &info->driver->driver, - (void *)platform_bus_type.match))) { + while ((dev = platform_find_device_by_driver(NULL, + &info->driver->driver))) { put_device(dev); platform_device_unregister(to_platform_device(dev)); } diff --git a/drivers/gpu/drm/mcde/mcde_drv.c b/drivers/gpu/drm/mcde/mcde_drv.c index baf63fb6850a..c07abf9e201c 100644 --- a/drivers/gpu/drm/mcde/mcde_drv.c +++ b/drivers/gpu/drm/mcde/mcde_drv.c @@ -477,8 +477,7 @@ static int mcde_probe(struct platform_device *pdev) struct device_driver *drv = &mcde_component_drivers[i]->driver; struct device *p = NULL, *d; - while ((d = bus_find_device(&platform_bus_type, p, drv, - (void *)platform_bus_type.match))) { + while ((d = platform_find_device_by_driver(p, drv))) { put_device(p); component_match_add(dev, &match, mcde_compare_dev, d); p = d; diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_drv.c b/drivers/gpu/drm/rockchip/rockchip_drm_drv.c index 53d2c5bd61dc..38dc26376961 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_drv.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_drv.c @@ -330,8 +330,7 @@ static struct component_match *rockchip_drm_match_add(struct device *dev) struct device *p = NULL, *d; do { - d = bus_find_device(&platform_bus_type, p, &drv->driver, - (void *)platform_bus_type.match); + d = platform_find_device_by_driver(p, &drv->driver); put_device(p); p = d; diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c index bf11930e40e1..1551c8253bec 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.c +++ b/drivers/gpu/drm/vc4/vc4_drv.c @@ -237,8 +237,7 @@ static void vc4_match_add_drivers(struct device *dev, struct device_driver *drv = &drivers[i]->driver; struct device *p = NULL, *d; - while ((d = bus_find_device(&platform_bus_type, p, drv, - (void *)platform_bus_type.match))) { + while ((d = platform_find_device_by_driver(p, drv))) { put_device(p); component_match_add(dev, match, compare_dev, d); p = d; diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 9bc36b589827..37e15a935a42 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -51,6 +51,9 @@ extern struct device platform_bus; extern void arch_setup_pdev_archdata(struct platform_device *); extern struct resource *platform_get_resource(struct platform_device *, unsigned int, unsigned int); +extern struct device * +platform_find_device_by_driver(struct device *start, + const struct device_driver *drv); extern void __iomem * devm_platform_ioremap_resource(struct platform_device *pdev, unsigned int index); -- cgit v1.2.3 From 2889ffcfc2522d6d25e5bda704275064062bbb21 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Mon, 29 Jul 2019 16:27:37 +0800 Subject: platform/x86: asus-wmi: cleanup AGFN fan handling The asus-wmi driver currently uses the "AGFN" interface and the FAN_CTRL device for fan control. According to the spec, this interface is very dated and marked as pending removal from products currently in development. Clean up the way that the AGFN fan is detected and handled, also preparing the driver for the introduction of an alternate fan control method needed to support recent Asus products. Not anticipating further development of this interface, simplify the code by dropping any notion of being able to control multiple AGFN fans (this was already limited to just a single fan through only exposing a single fan in sysfs). Check for the presence of AGFN fans at probe time, simplifying the code flow in asus_hwmon_sysfs_is_visible(). Signed-off-by: Daniel Drake Signed-off-by: Andy Shevchenko --- drivers/platform/x86/asus-wmi.c | 238 +++++++++++++---------------- include/linux/platform_data/x86/asus-wmi.h | 4 +- 2 files changed, 109 insertions(+), 133 deletions(-) (limited to 'include') diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 0c6a810fcb72..fc2939ac1cfe 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -65,6 +65,8 @@ MODULE_LICENSE("GPL"); #define ASUS_FAN_MFUN 0x13 #define ASUS_FAN_SFUN_READ 0x06 #define ASUS_FAN_SFUN_WRITE 0x07 + +/* Based on standard hwmon pwmX_enable values */ #define ASUS_FAN_CTRL_MANUAL 1 #define ASUS_FAN_CTRL_AUTO 2 @@ -120,7 +122,7 @@ struct agfn_args { } __packed; /* struct used for calling fan read and write methods */ -struct fan_args { +struct agfn_fan_args { struct agfn_args agfn; /* common fields */ u8 fan; /* fan number: 0: set auto mode 1: 1st fan */ u32 speed; /* read: RPM/100 - write: 0-255 */ @@ -148,6 +150,11 @@ struct asus_rfkill { u32 dev_id; }; +enum fan_type { + FAN_TYPE_NONE = 0, + FAN_TYPE_AGFN, /* deprecated on newer platforms */ +}; + struct asus_wmi { int dsts_id; int spec; @@ -178,9 +185,9 @@ struct asus_wmi { struct asus_rfkill gps; struct asus_rfkill uwb; - bool asus_hwmon_fan_manual_mode; - int asus_hwmon_num_fans; - int asus_hwmon_pwm; + enum fan_type fan_type; + int fan_pwm_mode; + int agfn_pwm; bool fan_boost_mode_available; u8 fan_boost_mode_mask; @@ -1125,10 +1132,10 @@ static void asus_wmi_set_als(void) /* Hwmon device ***************************************************************/ -static int asus_hwmon_agfn_fan_speed_read(struct asus_wmi *asus, int fan, +static int asus_agfn_fan_speed_read(struct asus_wmi *asus, int fan, int *speed) { - struct fan_args args = { + struct agfn_fan_args args = { .agfn.len = sizeof(args), .agfn.mfun = ASUS_FAN_MFUN, .agfn.sfun = ASUS_FAN_SFUN_READ, @@ -1152,10 +1159,10 @@ static int asus_hwmon_agfn_fan_speed_read(struct asus_wmi *asus, int fan, return 0; } -static int asus_hwmon_agfn_fan_speed_write(struct asus_wmi *asus, int fan, +static int asus_agfn_fan_speed_write(struct asus_wmi *asus, int fan, int *speed) { - struct fan_args args = { + struct agfn_fan_args args = { .agfn.len = sizeof(args), .agfn.mfun = ASUS_FAN_MFUN, .agfn.sfun = ASUS_FAN_SFUN_WRITE, @@ -1175,7 +1182,7 @@ static int asus_hwmon_agfn_fan_speed_write(struct asus_wmi *asus, int fan, return -ENXIO; if (speed && fan == 1) - asus->asus_hwmon_pwm = *speed; + asus->agfn_pwm = *speed; return 0; } @@ -1184,87 +1191,75 @@ static int asus_hwmon_agfn_fan_speed_write(struct asus_wmi *asus, int fan, * Check if we can read the speed of one fan. If true we assume we can also * control it. */ -static int asus_hwmon_get_fan_number(struct asus_wmi *asus, int *num_fans) +static bool asus_wmi_has_agfn_fan(struct asus_wmi *asus) { int status; - int speed = 0; + int speed; + u32 value; - *num_fans = 0; + status = asus_agfn_fan_speed_read(asus, 1, &speed); + if (status != 0) + return false; - status = asus_hwmon_agfn_fan_speed_read(asus, 1, &speed); - if (!status) - *num_fans = 1; + status = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_CTRL, &value); + if (status != 0) + return false; - return 0; + /* + * We need to find a better way, probably using sfun, + * bits or spec ... + * Currently we disable it if: + * - ASUS_WMI_UNSUPPORTED_METHOD is returned + * - reverved bits are non-zero + * - sfun and presence bit are not set + */ + return !(value == ASUS_WMI_UNSUPPORTED_METHOD || value & 0xFFF80000 + || (!asus->sfun && !(value & ASUS_WMI_DSTS_PRESENCE_BIT))); } -static int asus_hwmon_fan_set_auto(struct asus_wmi *asus) +static int asus_fan_set_auto(struct asus_wmi *asus) { int status; - status = asus_hwmon_agfn_fan_speed_write(asus, 0, NULL); + status = asus_agfn_fan_speed_write(asus, 0, NULL); if (status) return -ENXIO; - asus->asus_hwmon_fan_manual_mode = false; - return 0; } -static int asus_hwmon_fan_rpm_show(struct device *dev, int fan) +static ssize_t pwm1_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct asus_wmi *asus = dev_get_drvdata(dev); - int value; - int ret; - - /* no speed readable on manual mode */ - if (asus->asus_hwmon_fan_manual_mode) - return -ENXIO; - - ret = asus_hwmon_agfn_fan_speed_read(asus, fan+1, &value); - if (ret) { - pr_warn("reading fan speed failed: %d\n", ret); - return -ENXIO; - } - - return value; -} - -static void asus_hwmon_pwm_show(struct asus_wmi *asus, int fan, int *value) -{ int err; + int value; - if (asus->asus_hwmon_pwm >= 0) { - *value = asus->asus_hwmon_pwm; - return; - } + /* If we already set a value then just return it */ + if (asus->agfn_pwm >= 0) + return sprintf(buf, "%d\n", asus->agfn_pwm); - err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_CTRL, value); + /* + * If we haven't set already set a value through the AGFN interface, + * we read a current value through the (now-deprecated) FAN_CTRL device. + */ + err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_CTRL, &value); if (err < 0) - return; + return err; - *value &= 0xFF; - - if (*value == 1) /* Low Speed */ - *value = 85; - else if (*value == 2) - *value = 170; - else if (*value == 3) - *value = 255; - else if (*value) { - pr_err("Unknown fan speed %#x\n", *value); - *value = -1; + value &= 0xFF; + + if (value == 1) /* Low Speed */ + value = 85; + else if (value == 2) + value = 170; + else if (value == 3) + value = 255; + else if (value) { + pr_err("Unknown fan speed %#x\n", value); + value = -1; } -} - -static ssize_t pwm1_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct asus_wmi *asus = dev_get_drvdata(dev); - int value; - - asus_hwmon_pwm_show(asus, 0, &value); return sprintf(buf, "%d\n", value); } @@ -1284,11 +1279,11 @@ static ssize_t pwm1_store(struct device *dev, value = clamp(value, 0, 255); - state = asus_hwmon_agfn_fan_speed_write(asus, 1, &value); + state = asus_agfn_fan_speed_write(asus, 1, &value); if (state) pr_warn("Setting fan speed failed: %d\n", state); else - asus->asus_hwmon_fan_manual_mode = true; + asus->fan_pwm_mode = ASUS_FAN_CTRL_MANUAL; return count; } @@ -1297,10 +1292,21 @@ static ssize_t fan1_input_show(struct device *dev, struct device_attribute *attr, char *buf) { - int value = asus_hwmon_fan_rpm_show(dev, 0); + struct asus_wmi *asus = dev_get_drvdata(dev); + int value; + int ret; - return sprintf(buf, "%d\n", value < 0 ? -1 : value*100); + /* no speed readable on manual mode */ + if (asus->fan_pwm_mode == ASUS_FAN_CTRL_MANUAL) + return -ENXIO; + + ret = asus_agfn_fan_speed_read(asus, 1, &value); + if (ret) { + pr_warn("reading fan speed failed: %d\n", ret); + return -ENXIO; + } + return sprintf(buf, "%d\n", value < 0 ? -1 : value*100); } static ssize_t pwm1_enable_show(struct device *dev, @@ -1309,10 +1315,7 @@ static ssize_t pwm1_enable_show(struct device *dev, { struct asus_wmi *asus = dev_get_drvdata(dev); - if (asus->asus_hwmon_fan_manual_mode) - return sprintf(buf, "%d\n", ASUS_FAN_CTRL_MANUAL); - - return sprintf(buf, "%d\n", ASUS_FAN_CTRL_AUTO); + return sprintf(buf, "%d\n", asus->fan_pwm_mode); } static ssize_t pwm1_enable_store(struct device *dev, @@ -1329,14 +1332,21 @@ static ssize_t pwm1_enable_store(struct device *dev, if (ret) return ret; - if (state == ASUS_FAN_CTRL_MANUAL) - asus->asus_hwmon_fan_manual_mode = true; - else - status = asus_hwmon_fan_set_auto(asus); + switch (state) { + case ASUS_FAN_CTRL_MANUAL: + break; - if (status) - return status; + case ASUS_FAN_CTRL_AUTO: + status = asus_fan_set_auto(asus); + if (status) + return status; + break; + default: + return -EINVAL; + } + + asus->fan_pwm_mode = state; return count; } @@ -1389,59 +1399,31 @@ static umode_t asus_hwmon_sysfs_is_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct asus_wmi *asus = dev_get_drvdata(dev->parent); - int dev_id = -1; - int fan_attr = -1; u32 value = ASUS_WMI_UNSUPPORTED_METHOD; - bool ok = true; - - if (attr == &dev_attr_pwm1.attr) - dev_id = ASUS_WMI_DEVID_FAN_CTRL; - else if (attr == &dev_attr_temp1_input.attr) - dev_id = ASUS_WMI_DEVID_THERMAL_CTRL; - if (attr == &dev_attr_fan1_input.attr || attr == &dev_attr_fan1_label.attr || attr == &dev_attr_pwm1.attr || attr == &dev_attr_pwm1_enable.attr) { - fan_attr = 1; - } - - if (dev_id != -1) { - int err = asus_wmi_get_devstate(asus, dev_id, &value); + if (asus->fan_type == FAN_TYPE_NONE) + return 0; + } else if (attr == &dev_attr_temp1_input.attr) { + int err = asus_wmi_get_devstate(asus, + ASUS_WMI_DEVID_THERMAL_CTRL, + &value); - if (err < 0 && fan_attr == -1) + if (err < 0) return 0; /* can't return negative here */ - } - if (dev_id == ASUS_WMI_DEVID_FAN_CTRL) { - /* - * We need to find a better way, probably using sfun, - * bits or spec ... - * Currently we disable it if: - * - ASUS_WMI_UNSUPPORTED_METHOD is returned - * - reverved bits are non-zero - * - sfun and presence bit are not set - */ - if (value == ASUS_WMI_UNSUPPORTED_METHOD || value & 0xFFF80000 - || (!asus->sfun && !(value & ASUS_WMI_DSTS_PRESENCE_BIT))) - ok = false; - else - ok = fan_attr <= asus->asus_hwmon_num_fans; - } else if (dev_id == ASUS_WMI_DEVID_THERMAL_CTRL) { /* * If the temperature value in deci-Kelvin is near the absolute * zero temperature, something is clearly wrong */ if (value == 0 || value == 1) - ok = false; - } else if (fan_attr <= asus->asus_hwmon_num_fans && fan_attr != -1) { - ok = true; - } else { - ok = false; + return 0; } - return ok ? attr->mode : 0; + return attr->mode; } static const struct attribute_group hwmon_attribute_group = { @@ -1467,21 +1449,16 @@ static int asus_wmi_hwmon_init(struct asus_wmi *asus) static int asus_wmi_fan_init(struct asus_wmi *asus) { - int status; - - asus->asus_hwmon_pwm = -1; - asus->asus_hwmon_num_fans = -1; - asus->asus_hwmon_fan_manual_mode = false; + asus->fan_type = FAN_TYPE_NONE; + asus->agfn_pwm = -1; - status = asus_hwmon_get_fan_number(asus, &asus->asus_hwmon_num_fans); - if (status) { - asus->asus_hwmon_num_fans = 0; - pr_warn("Could not determine number of fans: %d\n", status); - return -ENXIO; + if (asus_wmi_has_agfn_fan(asus)) { + asus->fan_type = FAN_TYPE_AGFN; + asus_fan_set_auto(asus); + asus->fan_pwm_mode = ASUS_FAN_CTRL_AUTO; } - pr_info("Number of fans: %d\n", asus->asus_hwmon_num_fans); - return 0; + return asus->fan_type != FAN_TYPE_NONE; } /* Fan mode *******************************************************************/ @@ -2333,7 +2310,6 @@ static int asus_wmi_add(struct platform_device *pdev) goto fail_input; err = asus_wmi_fan_init(asus); /* probably no problems on error */ - asus_hwmon_fan_set_auto(asus); err = asus_wmi_hwmon_init(asus); if (err) @@ -2425,7 +2401,7 @@ static int asus_wmi_remove(struct platform_device *device) asus_wmi_rfkill_exit(asus); asus_wmi_debugfs_exit(asus); asus_wmi_sysfs_exit(asus->platform_device); - asus_hwmon_fan_set_auto(asus); + asus_fan_set_auto(asus); kfree(asus); return 0; diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 4802cd2c7309..5ae9c062a1f6 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -12,7 +12,7 @@ #define ASUS_WMI_METHODID_GPID 0x44495047 /* Get Panel ID?? (Resol) */ #define ASUS_WMI_METHODID_QMOD 0x444F4D51 /* Quiet MODe */ #define ASUS_WMI_METHODID_SPLV 0x4C425053 /* Set Panel Light Value */ -#define ASUS_WMI_METHODID_AGFN 0x4E464741 /* FaN? */ +#define ASUS_WMI_METHODID_AGFN 0x4E464741 /* Atk Generic FuNction */ #define ASUS_WMI_METHODID_SFUN 0x4E554653 /* FUNCtionalities */ #define ASUS_WMI_METHODID_SDSP 0x50534453 /* Set DiSPlay output */ #define ASUS_WMI_METHODID_GDSP 0x50534447 /* Get DiSPlay output */ @@ -72,7 +72,7 @@ /* Fan, Thermal */ #define ASUS_WMI_DEVID_THERMAL_CTRL 0x00110011 -#define ASUS_WMI_DEVID_FAN_CTRL 0x00110012 +#define ASUS_WMI_DEVID_FAN_CTRL 0x00110012 /* deprecated */ /* Power */ #define ASUS_WMI_DEVID_PROCESSOR_STATE 0x00120012 -- cgit v1.2.3 From e3168b874321d04c160c9eb937919eb926ae232f Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Mon, 29 Jul 2019 16:27:39 +0800 Subject: platform/x86: asus-wmi: fix CPU fan control on recent products Previously, asus-wmi was using the AGFN interface and FAN_CTRL device for CPU fan control. However, this code has been found to be not fully working on some recent products, and having checked the spec, these interfaces are marked as being removed from future products currently in development. The replacement appears to be the CPU_FAN device, added in spec version 8.3 (March 2014) and present on many modern Asus laptops. Add support for this device, and use it whenever it is detected. The older approach based on AGFN and FAN_CTRL is used as a fallback on products that do not have such device. Other than switching between automatic and full speed, there is no fan speed control through this new interface. Signed-off-by: Daniel Drake Signed-off-by: Andy Shevchenko --- drivers/platform/x86/asus-wmi.c | 125 +++++++++++++++++++++++------ include/linux/platform_data/x86/asus-wmi.h | 1 + 2 files changed, 101 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 25f1e256c442..34dfbed65332 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -67,6 +67,7 @@ MODULE_LICENSE("GPL"); #define ASUS_FAN_SFUN_WRITE 0x07 /* Based on standard hwmon pwmX_enable values */ +#define ASUS_FAN_CTRL_FULLSPEED 0 #define ASUS_FAN_CTRL_MANUAL 1 #define ASUS_FAN_CTRL_AUTO 2 @@ -153,6 +154,7 @@ struct asus_rfkill { enum fan_type { FAN_TYPE_NONE = 0, FAN_TYPE_AGFN, /* deprecated on newer platforms */ + FAN_TYPE_SPEC83, /* starting in Spec 8.3, use CPU_FAN_CTRL */ }; struct asus_wmi { @@ -1211,10 +1213,29 @@ static bool asus_wmi_has_agfn_fan(struct asus_wmi *asus) static int asus_fan_set_auto(struct asus_wmi *asus) { int status; + u32 retval; - status = asus_agfn_fan_speed_write(asus, 0, NULL); - if (status) + switch (asus->fan_type) { + case FAN_TYPE_SPEC83: + status = asus_wmi_set_devstate(ASUS_WMI_DEVID_CPU_FAN_CTRL, + 0, &retval); + if (status) + return status; + + if (retval != 1) + return -EIO; + break; + + case FAN_TYPE_AGFN: + status = asus_agfn_fan_speed_write(asus, 0, NULL); + if (status) + return -ENXIO; + break; + + default: return -ENXIO; + } + return 0; } @@ -1287,13 +1308,29 @@ static ssize_t fan1_input_show(struct device *dev, int value; int ret; - /* no speed readable on manual mode */ - if (asus->fan_pwm_mode == ASUS_FAN_CTRL_MANUAL) - return -ENXIO; + switch (asus->fan_type) { + case FAN_TYPE_SPEC83: + ret = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_CPU_FAN_CTRL, + &value); + if (ret < 0) + return ret; - ret = asus_agfn_fan_speed_read(asus, 1, &value); - if (ret) { - pr_warn("reading fan speed failed: %d\n", ret); + value &= 0xffff; + break; + + case FAN_TYPE_AGFN: + /* no speed readable on manual mode */ + if (asus->fan_pwm_mode == ASUS_FAN_CTRL_MANUAL) + return -ENXIO; + + ret = asus_agfn_fan_speed_read(asus, 1, &value); + if (ret) { + pr_warn("reading fan speed failed: %d\n", ret); + return -ENXIO; + } + break; + + default: return -ENXIO; } @@ -1306,6 +1343,15 @@ static ssize_t pwm1_enable_show(struct device *dev, { struct asus_wmi *asus = dev_get_drvdata(dev); + /* + * Just read back the cached pwm mode. + * + * For the CPU_FAN device, the spec indicates that we should be + * able to read the device status and consult bit 19 to see if we + * are in Full On or Automatic mode. However, this does not work + * in practice on X532FL at least (the bit is always 0) and there's + * also nothing in the DSDT to indicate that this behaviour exists. + */ return sprintf(buf, "%d\n", asus->fan_pwm_mode); } @@ -1316,25 +1362,48 @@ static ssize_t pwm1_enable_store(struct device *dev, struct asus_wmi *asus = dev_get_drvdata(dev); int status = 0; int state; + int value; int ret; + u32 retval; ret = kstrtouint(buf, 10, &state); if (ret) return ret; - switch (state) { - case ASUS_FAN_CTRL_MANUAL: - break; + if (asus->fan_type == FAN_TYPE_SPEC83) { + switch (state) { /* standard documented hwmon values */ + case ASUS_FAN_CTRL_FULLSPEED: + value = 1; + break; + case ASUS_FAN_CTRL_AUTO: + value = 0; + break; + default: + return -EINVAL; + } - case ASUS_FAN_CTRL_AUTO: - status = asus_fan_set_auto(asus); - if (status) - return status; - break; + ret = asus_wmi_set_devstate(ASUS_WMI_DEVID_CPU_FAN_CTRL, + value, &retval); + if (ret) + return ret; - default: - return -EINVAL; + if (retval != 1) + return -EIO; + } else if (asus->fan_type == FAN_TYPE_AGFN) { + switch (state) { + case ASUS_FAN_CTRL_MANUAL: + break; + + case ASUS_FAN_CTRL_AUTO: + status = asus_fan_set_auto(asus); + if (status) + return status; + break; + + default: + return -EINVAL; + } } asus->fan_pwm_mode = state; @@ -1392,9 +1461,11 @@ static umode_t asus_hwmon_sysfs_is_visible(struct kobject *kobj, struct asus_wmi *asus = dev_get_drvdata(dev->parent); u32 value = ASUS_WMI_UNSUPPORTED_METHOD; - if (attr == &dev_attr_fan1_input.attr + if (attr == &dev_attr_pwm1.attr) { + if (asus->fan_type != FAN_TYPE_AGFN) + return 0; + } else if (attr == &dev_attr_fan1_input.attr || attr == &dev_attr_fan1_label.attr - || attr == &dev_attr_pwm1.attr || attr == &dev_attr_pwm1_enable.attr) { if (asus->fan_type == FAN_TYPE_NONE) return 0; @@ -1443,13 +1514,17 @@ static int asus_wmi_fan_init(struct asus_wmi *asus) asus->fan_type = FAN_TYPE_NONE; asus->agfn_pwm = -1; - if (asus_wmi_has_agfn_fan(asus)) { + if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_CPU_FAN_CTRL)) + asus->fan_type = FAN_TYPE_SPEC83; + else if (asus_wmi_has_agfn_fan(asus)) asus->fan_type = FAN_TYPE_AGFN; - asus_fan_set_auto(asus); - asus->fan_pwm_mode = ASUS_FAN_CTRL_AUTO; - } - return asus->fan_type != FAN_TYPE_NONE; + if (asus->fan_type == FAN_TYPE_NONE) + return -ENODEV; + + asus_fan_set_auto(asus); + asus->fan_pwm_mode = ASUS_FAN_CTRL_AUTO; + return 0; } /* Fan mode *******************************************************************/ diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 5ae9c062a1f6..409e16064f4b 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -73,6 +73,7 @@ /* Fan, Thermal */ #define ASUS_WMI_DEVID_THERMAL_CTRL 0x00110011 #define ASUS_WMI_DEVID_FAN_CTRL 0x00110012 /* deprecated */ +#define ASUS_WMI_DEVID_CPU_FAN_CTRL 0x00110013 /* Power */ #define ASUS_WMI_DEVID_PROCESSOR_STATE 0x00120012 -- cgit v1.2.3 From c776dd50196a07a0ef943fcb74e8c86f9a5a20a8 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:38 -0600 Subject: PCI: Make PCI_PM_* delay times private These delay time definitions: #define PCI_PM_D2_DELAY 200 #define PCI_PM_D3_WAIT 10 #define PCI_PM_D3COLD_WAIT 100 #define PCI_PM_BUS_WAIT 50 are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-2-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 5 +++++ include/linux/pci.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1be03a97cb92..708413632429 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -39,6 +39,11 @@ int pci_probe_reset_function(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); +#define PCI_PM_D2_DELAY 200 +#define PCI_PM_D3_WAIT 10 +#define PCI_PM_D3COLD_WAIT 100 +#define PCI_PM_BUS_WAIT 50 + /** * struct pci_platform_pm_ops - Firmware PM callbacks * diff --git a/include/linux/pci.h b/include/linux/pci.h index 61b64fa1b0b1..9c2c6e161cfd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -145,11 +145,6 @@ static inline const char *pci_power_name(pci_power_t state) return pci_power_names[1 + (__force int) state]; } -#define PCI_PM_D2_DELAY 200 -#define PCI_PM_D3_WAIT 10 -#define PCI_PM_D3COLD_WAIT 100 -#define PCI_PM_BUS_WAIT 50 - /** * typedef pci_channel_state_t * -- cgit v1.2.3 From 669696ebbccc40e8096a2ee9809c028fc1538001 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:39 -0600 Subject: PCI: Make pci_check_pme_status(), pci_pme_wakeup_bus() private These interfaces: bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-3-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ include/linux/pci.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 708413632429..ad1fe54ab8ee 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -89,6 +89,8 @@ void pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); +bool pci_check_pme_status(struct pci_dev *dev); +void pci_pme_wakeup_bus(struct pci_bus *bus); int __pci_pme_wakeup(struct pci_dev *dev, void *ign); void pci_pme_restore(struct pci_dev *dev); bool pci_dev_need_resume(struct pci_dev *dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 9c2c6e161cfd..b13fb829171e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1236,8 +1236,6 @@ int pci_wake_from_d3(struct pci_dev *dev, bool enable); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); -bool pci_check_pme_status(struct pci_dev *dev); -void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_d3cold_enable(struct pci_dev *dev); void pci_d3cold_disable(struct pci_dev *dev); bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); -- cgit v1.2.3 From 975e1ac173058b8710e5979e97fc1397233301f3 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:40 -0600 Subject: PCI: Make pci_get_host_bridge_device(), pci_put_host_bridge_device() private These interfaces: struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-4-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ad1fe54ab8ee..b06c750e08d7 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -243,6 +243,9 @@ enum pci_bar_type { pci_bar_mem64, /* A 64-bit memory BAR */ }; +struct device *pci_get_host_bridge_device(struct pci_dev *dev); +void pci_put_host_bridge_device(struct device *dev); + int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); diff --git a/include/linux/pci.h b/include/linux/pci.h index b13fb829171e..ffb904e7895d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -644,9 +644,6 @@ static inline struct pci_dev *pci_upstream_bridge(struct pci_dev *dev) return dev->bus->self; } -struct device *pci_get_host_bridge_device(struct pci_dev *dev); -void pci_put_host_bridge_device(struct device *dev); - #ifdef CONFIG_PCI_MSI static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { -- cgit v1.2.3 From 440589dd1068fb36b2fe10ccddf97e43267e3b8d Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:41 -0600 Subject: PCI: Make pci_save_vc_state(), pci_restore_vc_state(), etc private These Virtual Channel interfaces: int pci_save_vc_state(struct pci_dev *dev); void pci_restore_vc_state(struct pci_dev *dev); void pci_allocate_vc_save_buffers(struct pci_dev *dev); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-5-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 5 +++++ include/linux/pci.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b06c750e08d7..19cda6e6fccd 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -130,6 +130,11 @@ void pci_vpd_release(struct pci_dev *dev); void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev); void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev); +/* PCI Virtual Channel */ +int pci_save_vc_state(struct pci_dev *dev); +void pci_restore_vc_state(struct pci_dev *dev); +void pci_allocate_vc_save_buffers(struct pci_dev *dev); + /* PCI /proc functions */ #ifdef CONFIG_PROC_FS int pci_proc_attach_device(struct pci_dev *dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index ffb904e7895d..3ff458f89190 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1239,11 +1239,6 @@ bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); void pci_wakeup_bus(struct pci_bus *bus); void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state); -/* PCI Virtual Channel */ -int pci_save_vc_state(struct pci_dev *dev); -void pci_restore_vc_state(struct pci_dev *dev); -void pci_allocate_vc_save_buffers(struct pci_dev *dev); - /* For use by arch with custom probe code */ void set_pcie_port_type(struct pci_dev *pdev); void set_pcie_hotplug_bridge(struct pci_dev *pdev); -- cgit v1.2.3 From 003d3b2c5f835d897b3b10a13a9e1340630e93f1 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:42 -0600 Subject: PCI: Make pci_hotplug_io_size, mem_size, and bus_size private These symbols: extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mem_size; extern unsigned long pci_hotplug_bus_size; are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-6-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 19cda6e6fccd..9698fa805e28 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -208,6 +208,9 @@ extern const struct attribute_group *pcibus_groups[]; extern const struct device_type pci_dev_type; extern const struct attribute_group *pci_bus_groups[]; +extern unsigned long pci_hotplug_io_size; +extern unsigned long pci_hotplug_mem_size; +extern unsigned long pci_hotplug_bus_size; /** * pci_match_one_device - Tell if a PCI device structure has a matching diff --git a/include/linux/pci.h b/include/linux/pci.h index 3ff458f89190..b5592b9b7f38 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2010,10 +2010,6 @@ extern unsigned long pci_cardbus_mem_size; extern u8 pci_dfl_cache_line_size; extern u8 pci_cache_line_size; -extern unsigned long pci_hotplug_io_size; -extern unsigned long pci_hotplug_mem_size; -extern unsigned long pci_hotplug_bus_size; - /* Architecture-specific versions may override these (weak) */ void pcibios_disable_device(struct pci_dev *dev); void pcibios_set_master(struct pci_dev *dev); -- cgit v1.2.3 From ecd29c1a38af4ba6e61382591dfe93f06f14ba25 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:43 -0600 Subject: PCI: Make pci_bus_get(), pci_bus_put() private These interfaces: struct pci_bus *pci_bus_get(struct pci_bus *bus); void pci_bus_put(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-7-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ include/linux/pci.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9698fa805e28..81e94c9f1c12 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -274,6 +274,8 @@ bool pci_bus_clip_resource(struct pci_dev *dev, int idx); void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); +struct pci_bus *pci_bus_get(struct pci_bus *bus); +void pci_bus_put(struct pci_bus *bus); /* PCIe link information */ #define PCIE_SPEED2STR(speed) \ diff --git a/include/linux/pci.h b/include/linux/pci.h index b5592b9b7f38..4d1b6f07f8f1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1282,8 +1282,6 @@ int pci_request_selected_regions_exclusive(struct pci_dev *, int, const char *); void pci_release_selected_regions(struct pci_dev *, int); /* drivers/pci/bus.c */ -struct pci_bus *pci_bus_get(struct pci_bus *bus); -void pci_bus_put(struct pci_bus *bus); void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset); -- cgit v1.2.3 From 5da78d95785daa9454336a88b2f86a8998f6c739 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:44 -0600 Subject: PCI: Make pcie_update_link_speed() private This interface: void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-8-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 1 + include/linux/pci.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 81e94c9f1c12..8459663358c5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -299,6 +299,7 @@ u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void __pcie_print_link_status(struct pci_dev *dev, bool verbose); void pcie_report_downtraining(struct pci_dev *dev); +void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); /* Single Root I/O Virtualization */ struct pci_sriov { diff --git a/include/linux/pci.h b/include/linux/pci.h index 4d1b6f07f8f1..8301352a937f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -987,7 +987,6 @@ struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, int pci_scan_root_bus_bridge(struct pci_host_bridge *bridge); struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr); -void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr, const char *name, struct hotplug_slot *hotplug); -- cgit v1.2.3 From b92b512a435da01b52de07e3dcc2f07a4ad404de Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:45 -0600 Subject: PCI: Make pci_ats_init() private This interface: void pci_ats_init(struct pci_dev *dev); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-9-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 7 ++++--- include/linux/pci.h | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8459663358c5..2bb59afb540e 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -439,11 +439,12 @@ static inline void pci_restore_dpc_state(struct pci_dev *dev) {} #endif #ifdef CONFIG_PCI_ATS +/* Address Translation Service */ +void pci_ats_init(struct pci_dev *dev); void pci_restore_ats_state(struct pci_dev *dev); #else -static inline void pci_restore_ats_state(struct pci_dev *dev) -{ -} +static inline void pci_ats_init(struct pci_dev *d) { } +static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ #ifdef CONFIG_PCI_IOV diff --git a/include/linux/pci.h b/include/linux/pci.h index 8301352a937f..9b8d0c2a1c9a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1761,13 +1761,11 @@ static inline bool pci_ats_disabled(void) { return true; } #ifdef CONFIG_PCI_ATS /* Address Translation Service */ -void pci_ats_init(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else -static inline void pci_ats_init(struct pci_dev *d) { } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } -- cgit v1.2.3 From 72bde9ced373ca1e27332fd020d248151319a3d6 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:46 -0600 Subject: PCI: Make pcie_set_ecrc_checking(), pcie_ecrc_get_policy() private These interfaces: void pcie_set_ecrc_checking(struct pci_dev *dev); void pcie_ecrc_get_policy(char *str); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-10-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 8 ++++++++ include/linux/pci.h | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2bb59afb540e..d3d006c95041 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -541,6 +541,14 @@ static inline void pcie_aspm_create_sysfs_dev_files(struct pci_dev *pdev) { } static inline void pcie_aspm_remove_sysfs_dev_files(struct pci_dev *pdev) { } #endif +#ifdef CONFIG_PCIE_ECRC +void pcie_set_ecrc_checking(struct pci_dev *dev); +void pcie_ecrc_get_policy(char *str); +#else +static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } +static inline void pcie_ecrc_get_policy(char *str) { } +#endif + #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); #else diff --git a/include/linux/pci.h b/include/linux/pci.h index 9b8d0c2a1c9a..65502c564661 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1559,14 +1559,6 @@ bool pci_aer_available(void); static inline bool pci_aer_available(void) { return false; } #endif -#ifdef CONFIG_PCIE_ECRC -void pcie_set_ecrc_checking(struct pci_dev *dev); -void pcie_ecrc_get_policy(char *str); -#else -static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } -static inline void pcie_ecrc_get_policy(char *str) { } -#endif - bool pci_ats_disabled(void); #ifdef CONFIG_PCIE_PTM -- cgit v1.2.3 From ac6c26da29c12fa511c877c273ed5c939dc9e96c Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:47 -0600 Subject: PCI: Make pci_enable_ptm() private This interface: int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-11-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 7 ------- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d3d006c95041..8935fc1ff446 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -551,8 +551,11 @@ static inline void pcie_ecrc_get_policy(char *str) { } #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); +int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); #else static inline void pci_ptm_init(struct pci_dev *dev) { } +static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +{ return -EINVAL; } #endif struct pci_dev_reset_methods { diff --git a/include/linux/pci.h b/include/linux/pci.h index 65502c564661..ef73b0c87f01 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1561,13 +1561,6 @@ static inline bool pci_aer_available(void) { return false; } bool pci_ats_disabled(void); -#ifdef CONFIG_PCIE_PTM -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); -#else -static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) -{ return -EINVAL; } -#endif - void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); -- cgit v1.2.3 From 621f7e354fd8f99db0e3f7a0e8cda238caee9f3a Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:48 -0600 Subject: PCI: Make pci_set_of_node(), etc private These interfaces: void pci_set_of_node(struct pci_dev *dev); void pci_release_of_node(struct pci_dev *dev); void pci_set_bus_of_node(struct pci_bus *bus); void pci_release_bus_of_node(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-12-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 9 +++++++++ include/linux/pci.h | 8 -------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8935fc1ff446..61bbfd611140 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -592,6 +592,10 @@ struct device_node; int of_pci_parse_bus_range(struct device_node *node, struct resource *res); int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); +void pci_set_of_node(struct pci_dev *dev); +void pci_release_of_node(struct pci_dev *dev); +void pci_set_bus_of_node(struct pci_bus *bus); +void pci_release_bus_of_node(struct pci_bus *bus); #else static inline int @@ -611,6 +615,11 @@ of_pci_get_max_link_speed(struct device_node *node) { return -EINVAL; } + +static inline void pci_set_of_node(struct pci_dev *dev) { } +static inline void pci_release_of_node(struct pci_dev *dev) { } +static inline void pci_set_bus_of_node(struct pci_bus *bus) { } +static inline void pci_release_bus_of_node(struct pci_bus *bus) { } #endif /* CONFIG_OF */ #if defined(CONFIG_OF_ADDRESS) diff --git a/include/linux/pci.h b/include/linux/pci.h index ef73b0c87f01..5a26b7db71b3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2259,10 +2259,6 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, #ifdef CONFIG_OF struct device_node; struct irq_domain; -void pci_set_of_node(struct pci_dev *dev); -void pci_release_of_node(struct pci_dev *dev); -void pci_set_bus_of_node(struct pci_bus *bus); -void pci_release_bus_of_node(struct pci_bus *bus); struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus); int pci_parse_request_of_pci_ranges(struct device *dev, struct list_head *resources, @@ -2272,10 +2268,6 @@ int pci_parse_request_of_pci_ranges(struct device *dev, struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus); #else /* CONFIG_OF */ -static inline void pci_set_of_node(struct pci_dev *dev) { } -static inline void pci_release_of_node(struct pci_dev *dev) { } -static inline void pci_set_bus_of_node(struct pci_bus *bus) { } -static inline void pci_release_bus_of_node(struct pci_bus *bus) { } static inline struct irq_domain * pci_host_bridge_of_msi_domain(struct pci_bus *bus) { return NULL; } static inline int pci_parse_request_of_pci_ranges(struct device *dev, -- cgit v1.2.3 From 259231a045616c4101d023a8f4dcc8379af265a6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 3 Jul 2019 20:51:26 -0300 Subject: cpuidle: add poll_limit_ns to cpuidle_device structure Add a poll_limit_ns variable to cpuidle_device structure. Calculate and configure it in the new cpuidle_poll_time function, in case its zero. Individual governors are allowed to override this value. Signed-off-by: Marcelo Tosatti Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 30 ++++++++++++++++++++++++++++++ drivers/cpuidle/poll_state.c | 11 ++--------- drivers/cpuidle/sysfs.c | 7 +++++++ include/linux/cpuidle.h | 6 ++++++ 4 files changed, 45 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 0f4b7c45df3e..0895b988fa92 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -361,6 +361,36 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index) cpuidle_curr_governor->reflect(dev, index); } +/** + * cpuidle_poll_time - return amount of time to poll for, + * governors can override dev->poll_limit_ns if necessary + * + * @drv: the cpuidle driver tied with the cpu + * @dev: the cpuidle device + * + */ +u64 cpuidle_poll_time(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + int i; + u64 limit_ns; + + if (dev->poll_limit_ns) + return dev->poll_limit_ns; + + limit_ns = TICK_NSEC; + for (i = 1; i < drv->state_count; i++) { + if (drv->states[i].disabled || dev->states_usage[i].disable) + continue; + + limit_ns = (u64)drv->states[i].target_residency * NSEC_PER_USEC; + } + + dev->poll_limit_ns = limit_ns; + + return dev->poll_limit_ns; +} + /** * cpuidle_install_idle_handler - installs the cpuidle idle loop handler */ diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 02b9315a9e96..c8fa5f41dfc4 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -20,16 +20,9 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, local_irq_enable(); if (!current_set_polling_and_test()) { unsigned int loop_count = 0; - u64 limit = TICK_NSEC; - int i; + u64 limit; - for (i = 1; i < drv->state_count; i++) { - if (drv->states[i].disabled || dev->states_usage[i].disable) - continue; - - limit = (u64)drv->states[i].target_residency * NSEC_PER_USEC; - break; - } + limit = cpuidle_poll_time(drv, dev); while (!need_resched()) { cpu_relax(); diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index eb20adb5de23..2bb2683b493c 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -334,6 +334,7 @@ struct cpuidle_state_kobj { struct cpuidle_state_usage *state_usage; struct completion kobj_unregister; struct kobject kobj; + struct cpuidle_device *device; }; #ifdef CONFIG_SUSPEND @@ -391,6 +392,7 @@ static inline void cpuidle_remove_s2idle_attr_group(struct cpuidle_state_kobj *k #define kobj_to_state_obj(k) container_of(k, struct cpuidle_state_kobj, kobj) #define kobj_to_state(k) (kobj_to_state_obj(k)->state) #define kobj_to_state_usage(k) (kobj_to_state_obj(k)->state_usage) +#define kobj_to_device(k) (kobj_to_state_obj(k)->device) #define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr) static ssize_t cpuidle_state_show(struct kobject *kobj, struct attribute *attr, @@ -414,10 +416,14 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state *state = kobj_to_state(kobj); struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); + struct cpuidle_device *dev = kobj_to_device(kobj); if (cattr->store) ret = cattr->store(state, state_usage, buf, size); + /* reset poll time cache */ + dev->poll_limit_ns = 0; + return ret; } @@ -468,6 +474,7 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) } kobj->state = &drv->states[i]; kobj->state_usage = &device->states_usage[i]; + kobj->device = device; init_completion(&kobj->kobj_unregister); ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle, diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb9a0db89f1a..b484dd69ec21 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -86,6 +86,7 @@ struct cpuidle_device { ktime_t next_hrtimer; int last_residency; + u64 poll_limit_ns; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; struct cpuidle_driver_kobj *kobj_driver; @@ -132,6 +133,8 @@ extern int cpuidle_select(struct cpuidle_driver *drv, extern int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index); extern void cpuidle_reflect(struct cpuidle_device *dev, int index); +extern u64 cpuidle_poll_time(struct cpuidle_driver *drv, + struct cpuidle_device *dev); extern int cpuidle_register_driver(struct cpuidle_driver *drv); extern struct cpuidle_driver *cpuidle_get_driver(void); @@ -166,6 +169,9 @@ static inline int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) {return -ENODEV; } static inline void cpuidle_reflect(struct cpuidle_device *dev, int index) { } +extern u64 cpuidle_poll_time(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{return 0; } static inline int cpuidle_register_driver(struct cpuidle_driver *drv) {return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; } -- cgit v1.2.3 From 7d4daeedd575bbc3c40c87fc6708a8b88c50fe7e Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 3 Jul 2019 20:51:27 -0300 Subject: governors: unify last_state_idx Since this field is shared by all governors, move it to cpuidle device structure. Signed-off-by: Marcelo Tosatti Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/ladder.c | 21 ++++++++++----------- drivers/cpuidle/governors/menu.c | 5 ++--- drivers/cpuidle/governors/teo.c | 12 ++++++------ include/linux/cpuidle.h | 1 + 4 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index f0dddc66af26..428eeb832fe7 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -38,7 +38,6 @@ struct ladder_device_state { struct ladder_device { struct ladder_device_state states[CPUIDLE_STATE_MAX]; - int last_state_idx; }; static DEFINE_PER_CPU(struct ladder_device, ladder_devices); @@ -49,12 +48,13 @@ static DEFINE_PER_CPU(struct ladder_device, ladder_devices); * @old_idx: the current state index * @new_idx: the new target state index */ -static inline void ladder_do_selection(struct ladder_device *ldev, +static inline void ladder_do_selection(struct cpuidle_device *dev, + struct ladder_device *ldev, int old_idx, int new_idx) { ldev->states[old_idx].stats.promotion_count = 0; ldev->states[old_idx].stats.demotion_count = 0; - ldev->last_state_idx = new_idx; + dev->last_state_idx = new_idx; } /** @@ -68,13 +68,13 @@ static int ladder_select_state(struct cpuidle_driver *drv, { struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct ladder_device_state *last_state; - int last_residency, last_idx = ldev->last_state_idx; + int last_residency, last_idx = dev->last_state_idx; int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 1 : 0; int latency_req = cpuidle_governor_latency_req(dev->cpu); /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) { - ladder_do_selection(ldev, last_idx, 0); + ladder_do_selection(dev, ldev, last_idx, 0); return 0; } @@ -91,7 +91,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, last_state->stats.promotion_count++; last_state->stats.demotion_count = 0; if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { - ladder_do_selection(ldev, last_idx, last_idx + 1); + ladder_do_selection(dev, ldev, last_idx, last_idx + 1); return last_idx + 1; } } @@ -107,7 +107,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, if (drv->states[i].exit_latency <= latency_req) break; } - ladder_do_selection(ldev, last_idx, i); + ladder_do_selection(dev, ldev, last_idx, i); return i; } @@ -116,7 +116,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, last_state->stats.demotion_count++; last_state->stats.promotion_count = 0; if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { - ladder_do_selection(ldev, last_idx, last_idx - 1); + ladder_do_selection(dev, ldev, last_idx, last_idx - 1); return last_idx - 1; } } @@ -139,7 +139,7 @@ static int ladder_enable_device(struct cpuidle_driver *drv, struct ladder_device_state *lstate; struct cpuidle_state *state; - ldev->last_state_idx = first_idx; + dev->last_state_idx = first_idx; for (i = first_idx; i < drv->state_count; i++) { state = &drv->states[i]; @@ -167,9 +167,8 @@ static int ladder_enable_device(struct cpuidle_driver *drv, */ static void ladder_reflect(struct cpuidle_device *dev, int index) { - struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); if (index > 0) - ldev->last_state_idx = index; + dev->last_state_idx = index; } static struct cpuidle_governor ladder_governor = { diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index e9a28c7846d6..dace4c7f830c 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -117,7 +117,6 @@ */ struct menu_device { - int last_state_idx; int needs_update; int tick_wakeup; @@ -455,7 +454,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index) { struct menu_device *data = this_cpu_ptr(&menu_devices); - data->last_state_idx = index; + dev->last_state_idx = index; data->needs_update = 1; data->tick_wakeup = tick_nohz_idle_got_tick(); } @@ -468,7 +467,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index) static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct menu_device *data = this_cpu_ptr(&menu_devices); - int last_idx = data->last_state_idx; + int last_idx = dev->last_state_idx; struct cpuidle_state *target = &drv->states[last_idx]; unsigned int measured_us; unsigned int new_factor; diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 7d05efdbd3c6..a2fd81067a13 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -96,7 +96,6 @@ struct teo_idle_state { * @time_span_ns: Time between idle state selection and post-wakeup update. * @sleep_length_ns: Time till the closest timer event (at the selection time). * @states: Idle states data corresponding to this CPU. - * @last_state: Idle state entered by the CPU last time. * @interval_idx: Index of the most recent saved idle interval. * @intervals: Saved idle duration values. */ @@ -104,7 +103,6 @@ struct teo_cpu { u64 time_span_ns; u64 sleep_length_ns; struct teo_idle_state states[CPUIDLE_STATE_MAX]; - int last_state; int interval_idx; unsigned int intervals[INTERVALS]; }; @@ -130,7 +128,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) */ measured_us = sleep_length_us; } else { - unsigned int lat = drv->states[cpu_data->last_state].exit_latency; + unsigned int lat; + + lat = drv->states[dev->last_state_idx].exit_latency; measured_us = ktime_to_us(cpu_data->time_span_ns); /* @@ -245,9 +245,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, int max_early_idx, idx, i; ktime_t delta_tick; - if (cpu_data->last_state >= 0) { + if (dev->last_state_idx >= 0) { teo_update(drv, dev); - cpu_data->last_state = -1; + dev->last_state_idx = -1; } cpu_data->time_span_ns = local_clock(); @@ -394,7 +394,7 @@ static void teo_reflect(struct cpuidle_device *dev, int state) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - cpu_data->last_state = state; + dev->last_state_idx = state; /* * If the wakeup was not "natural", but triggered by one of the safety * nets, assume that the CPU might have been idle for the entire sleep diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index b484dd69ec21..ba535a1a47d5 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -85,6 +85,7 @@ struct cpuidle_device { unsigned int cpu; ktime_t next_hrtimer; + int last_state_idx; int last_residency; u64 poll_limit_ns; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; -- cgit v1.2.3 From a1c4423b02b2121108e3ea9580741e0f26309a48 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 3 Jul 2019 20:51:29 -0300 Subject: cpuidle-haltpoll: disable host side polling when kvm virtualized When performing guest side polling, it is not necessary to also perform host side polling. So disable host side polling, via the new MSR interface, when loading cpuidle-haltpoll driver. Signed-off-by: Marcelo Tosatti Signed-off-by: Rafael J. Wysocki --- arch/x86/Kconfig | 7 ++++++ arch/x86/include/asm/cpuidle_haltpoll.h | 8 +++++++ arch/x86/kernel/kvm.c | 42 +++++++++++++++++++++++++++++++++ drivers/cpuidle/cpuidle-haltpoll.c | 9 ++++++- include/linux/cpuidle_haltpoll.h | 16 +++++++++++++ 5 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/cpuidle_haltpoll.h create mode 100644 include/linux/cpuidle_haltpoll.h (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 222855cc0158..05e78acb187c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -794,6 +794,7 @@ config KVM_GUEST bool "KVM Guest support (including kvmclock)" depends on PARAVIRT select PARAVIRT_CLOCK + select ARCH_CPUIDLE_HALTPOLL default y ---help--- This option enables various optimizations for running under the KVM @@ -802,6 +803,12 @@ config KVM_GUEST underlying device model, the host provides the guest with timing infrastructure such as time of day, and system time +config ARCH_CPUIDLE_HALTPOLL + def_bool n + prompt "Disable host haltpoll when loading haltpoll driver" + help + If virtualized under KVM, disable host haltpoll. + config PVH bool "Support for running PVH guests" ---help--- diff --git a/arch/x86/include/asm/cpuidle_haltpoll.h b/arch/x86/include/asm/cpuidle_haltpoll.h new file mode 100644 index 000000000000..ff8607d81526 --- /dev/null +++ b/arch/x86/include/asm/cpuidle_haltpoll.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ARCH_HALTPOLL_H +#define _ARCH_HALTPOLL_H + +void arch_haltpoll_enable(void); +void arch_haltpoll_disable(void); + +#endif diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b7f34fe2171e..f48401be8ce0 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -875,3 +875,45 @@ void __init kvm_spinlock_init(void) } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + +static void kvm_disable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +} + +static void kvm_enable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 1); +} + +void arch_haltpoll_enable(void) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { + printk(KERN_ERR "kvm: host does not support poll control\n"); + printk(KERN_ERR "kvm: host upgrade recommended\n"); + return; + } + + preempt_disable(); + /* Enable guest halt poll disables host halt poll */ + kvm_disable_host_haltpoll(NULL); + smp_call_function(kvm_disable_host_haltpoll, NULL, 1); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_enable); + +void arch_haltpoll_disable(void) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + return; + + preempt_disable(); + /* Enable guest halt poll disables host halt poll */ + kvm_enable_host_haltpoll(NULL); + smp_call_function(kvm_enable_host_haltpoll, NULL, 1); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_disable); +#endif diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 35cfb53e9287..9ac093dcbb01 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -15,6 +15,7 @@ #include #include #include +#include static int default_enter_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -47,6 +48,7 @@ static struct cpuidle_driver haltpoll_driver = { static int __init haltpoll_init(void) { + int ret; struct cpuidle_driver *drv = &haltpoll_driver; cpuidle_poll_state_init(drv); @@ -54,11 +56,16 @@ static int __init haltpoll_init(void) if (!kvm_para_available()) return 0; - return cpuidle_register(&haltpoll_driver, NULL); + ret = cpuidle_register(&haltpoll_driver, NULL); + if (ret == 0) + arch_haltpoll_enable(); + + return ret; } static void __exit haltpoll_exit(void) { + arch_haltpoll_disable(); cpuidle_unregister(&haltpoll_driver); } diff --git a/include/linux/cpuidle_haltpoll.h b/include/linux/cpuidle_haltpoll.h new file mode 100644 index 000000000000..fe5954c2409e --- /dev/null +++ b/include/linux/cpuidle_haltpoll.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _CPUIDLE_HALTPOLL_H +#define _CPUIDLE_HALTPOLL_H + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL +#include +#else +static inline void arch_haltpoll_enable(void) +{ +} + +static inline void arch_haltpoll_disable(void) +{ +} +#endif +#endif -- cgit v1.2.3 From 64a38e840ce5940253208eaba40265c73decc4ee Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Fri, 26 Jul 2019 18:33:01 -0400 Subject: SUNRPC: Track writers of the 'channel' file to improve cache_listeners_exist The sunrpc cache interface is susceptible to being fooled by a rogue process just reading a 'channel' file. If this happens the kernel may think a valid daemon exists to service the cache when it does not. For example, the following may fool the kernel: cat /proc/net/rpc/auth.unix.gid/channel Change the tracking of readers to writers when considering whether a listener exists as all valid daemon processes either open a channel file O_RDWR or O_WRONLY. While this does not prevent a rogue process from "stealing" a message from the kernel, it does at least improve the kernels perception of whether a valid process servicing the cache exists. Signed-off-by: Dave Wysochanski Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/cache.h | 6 +++--- net/sunrpc/cache.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index c7f38e897174..f7d086b77a21 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -107,9 +107,9 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; - atomic_t readers; /* how many time is /chennel open */ - time_t last_close; /* if no readers, when did last close */ - time_t last_warn; /* when we last warned about no readers */ + atomic_t writers; /* how many time is /channel open */ + time_t last_close; /* if no writers, when did last close */ + time_t last_warn; /* when we last warned about no writers */ union { struct proc_dir_entry *procfs; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 6f1528f271ee..a6a6190ad37a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -373,7 +373,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; - atomic_set(&cd->readers, 0); + atomic_set(&cd->writers, 0); cd->last_close = 0; cd->last_warn = -1; list_add(&cd->others, &cache_list); @@ -1029,11 +1029,13 @@ static int cache_open(struct inode *inode, struct file *filp, } rp->offset = 0; rp->q.reader = 1; - atomic_inc(&cd->readers); + spin_lock(&queue_lock); list_add(&rp->q.list, &cd->queue); spin_unlock(&queue_lock); } + if (filp->f_mode & FMODE_WRITE) + atomic_inc(&cd->writers); filp->private_data = rp; return 0; } @@ -1062,8 +1064,10 @@ static int cache_release(struct inode *inode, struct file *filp, filp->private_data = NULL; kfree(rp); + } + if (filp->f_mode & FMODE_WRITE) { + atomic_dec(&cd->writers); cd->last_close = seconds_since_boot(); - atomic_dec(&cd->readers); } module_put(cd->owner); return 0; @@ -1171,7 +1175,7 @@ static void warn_no_listener(struct cache_detail *detail) static bool cache_listeners_exist(struct cache_detail *detail) { - if (atomic_read(&detail->readers)) + if (atomic_read(&detail->writers)) return true; if (detail->last_close == 0) /* This cache was never opened */ -- cgit v1.2.3 From 509ce4c85bd055ee1013bc853b5d543428b0f017 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 29 Jul 2019 00:27:39 +0900 Subject: ppdev: add header include guard Add a header include guard just in case. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20190728152739.9249-1-yamada.masahiro@socionext.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ppdev.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ppdev.h b/include/uapi/linux/ppdev.h index 8fe3c64d149e..eb895b83f2bd 100644 --- a/include/uapi/linux/ppdev.h +++ b/include/uapi/linux/ppdev.h @@ -15,6 +15,9 @@ * Added PPGETMODES/PPGETMODE/PPGETPHASE, Fred Barnes , 03/01/2001 */ +#ifndef _UAPI_LINUX_PPDEV_H +#define _UAPI_LINUX_PPDEV_H + #define PP_IOCTL 'p' /* Set mode for read/write (e.g. IEEE1284_MODE_EPP) */ @@ -97,4 +100,4 @@ struct ppdev_frob_struct { /* only masks user-visible flags */ #define PP_FLAGMASK (PP_FASTWRITE | PP_FASTREAD | PP_W91284PIC) - +#endif /* _UAPI_LINUX_PPDEV_H */ -- cgit v1.2.3 From 89e524c04fa966330e2e80ab2bc50b9944c5847a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 30 Jul 2019 13:10:14 +0200 Subject: loop: Fix mount(2) failure due to race with LOOP_SET_FD Commit 33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener") made LOOP_SET_FD ioctl acquire exclusive block device reference while it updates loop device binding. However this can make perfectly valid mount(2) fail with EBUSY due to racing LOOP_SET_FD holding temporarily the exclusive bdev reference in cases like this: for i in {a..z}{a..z}; do dd if=/dev/zero of=$i.image bs=1k count=0 seek=1024 mkfs.ext2 $i.image mkdir mnt$i done echo "Run" for i in {a..z}{a..z}; do mount -o loop -t ext2 $i.image mnt$i & done Fix the problem by not getting full exclusive bdev reference in LOOP_SET_FD but instead just mark the bdev as being claimed while we update the binding information. This just blocks new exclusive openers instead of failing them with EBUSY thus fixing the problem. Fixes: 33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener") Cc: stable@vger.kernel.org Tested-by: Kai-Heng Feng Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 16 +++++----- fs/block_dev.c | 83 ++++++++++++++++++++++++++++++++++++---------------- include/linux/fs.h | 6 ++++ 3 files changed, 73 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 44c9985f352a..3036883fc9f8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -924,6 +924,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct file *file; struct inode *inode; struct address_space *mapping; + struct block_device *claimed_bdev = NULL; int lo_flags = 0; int error; loff_t size; @@ -942,10 +943,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { - bdgrab(bdev); - error = blkdev_get(bdev, mode | FMODE_EXCL, loop_set_fd); - if (error) + claimed_bdev = bd_start_claiming(bdev, loop_set_fd); + if (IS_ERR(claimed_bdev)) { + error = PTR_ERR(claimed_bdev); goto out_putf; + } } error = mutex_lock_killable(&loop_ctl_mutex); @@ -1015,15 +1017,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mutex_unlock(&loop_ctl_mutex); if (partscan) loop_reread_partitions(lo, bdev); - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); + if (claimed_bdev) + bd_abort_claiming(bdev, claimed_bdev, loop_set_fd); return 0; out_unlock: mutex_unlock(&loop_ctl_mutex); out_bdev: - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); + if (claimed_bdev) + bd_abort_claiming(bdev, claimed_bdev, loop_set_fd); out_putf: fput(file); out: diff --git a/fs/block_dev.c b/fs/block_dev.c index c2a85b587922..22591bad9353 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1181,8 +1181,7 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) * Pointer to the block device containing @bdev on success, ERR_PTR() * value on failure. */ -static struct block_device *bd_start_claiming(struct block_device *bdev, - void *holder) +struct block_device *bd_start_claiming(struct block_device *bdev, void *holder) { struct gendisk *disk; struct block_device *whole; @@ -1229,6 +1228,62 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, return ERR_PTR(err); } } +EXPORT_SYMBOL(bd_start_claiming); + +static void bd_clear_claiming(struct block_device *whole, void *holder) +{ + lockdep_assert_held(&bdev_lock); + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); +} + +/** + * bd_finish_claiming - finish claiming of a block device + * @bdev: block device of interest + * @whole: whole block device (returned from bd_start_claiming()) + * @holder: holder that has claimed @bdev + * + * Finish exclusive open of a block device. Mark the device as exlusively + * open by the holder and wake up all waiters for exclusive open to finish. + */ +void bd_finish_claiming(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + spin_lock(&bdev_lock); + BUG_ON(!bd_may_claim(bdev, whole, holder)); + /* + * Note that for a whole device bd_holders will be incremented twice, + * and bd_holder will be set to bd_may_claim before being set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + bd_clear_claiming(whole, holder); + spin_unlock(&bdev_lock); +} +EXPORT_SYMBOL(bd_finish_claiming); + +/** + * bd_abort_claiming - abort claiming of a block device + * @bdev: block device of interest + * @whole: whole block device (returned from bd_start_claiming()) + * @holder: holder that has claimed @bdev + * + * Abort claiming of a block device when the exclusive open failed. This can be + * also used when exclusive open is not actually desired and we just needed + * to block other exclusive openers for a while. + */ +void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + spin_lock(&bdev_lock); + bd_clear_claiming(whole, holder); + spin_unlock(&bdev_lock); +} +EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { @@ -1698,29 +1753,7 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) /* finish claiming */ mutex_lock(&bdev->bd_mutex); - spin_lock(&bdev_lock); - - if (!res) { - BUG_ON(!bd_may_claim(bdev, whole, holder)); - /* - * Note that for a whole device bd_holders - * will be incremented twice, and bd_holder - * will be set to bd_may_claim before being - * set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_may_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; - } - - /* tell others that we're done */ - BUG_ON(whole->bd_claiming != holder); - whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); - - spin_unlock(&bdev_lock); - + bd_finish_claiming(bdev, whole, holder); /* * Block event polling for write claims if requested. Any * write holder makes the write_holder state stick until diff --git a/include/linux/fs.h b/include/linux/fs.h index 56b8e358af5c..997a530ff4e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2598,6 +2598,12 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); +extern struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder); +extern void bd_finish_claiming(struct block_device *bdev, + struct block_device *whole, void *holder); +extern void bd_abort_claiming(struct block_device *bdev, + struct block_device *whole, void *holder); extern void blkdev_put(struct block_device *bdev, fmode_t mode); extern int __blkdev_reread_part(struct block_device *bdev); extern int blkdev_reread_part(struct block_device *bdev); -- cgit v1.2.3 From 7240b60c98d6309363a9f8d5a4ecd5b0626f2aff Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Tue, 30 Jul 2019 07:40:32 -0700 Subject: linux: Add skb_frag_t page_offset accessors Add skb_frag_off(), skb_frag_off_add(), skb_frag_off_set(), and skb_frag_off_copy() accessors for page_offset. Signed-off-by: Jonathan Lemon Signed-off-by: David S. Miller --- include/linux/skbuff.h | 67 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 718742b1c505..2957cdd6c032 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -312,7 +312,7 @@ extern int sysctl_max_skb_frags; typedef struct bio_vec skb_frag_t; /** - * skb_frag_size - Returns the size of a skb fragment + * skb_frag_size() - Returns the size of a skb fragment * @frag: skb fragment */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) @@ -321,7 +321,7 @@ static inline unsigned int skb_frag_size(const skb_frag_t *frag) } /** - * skb_frag_size_set - Sets the size of a skb fragment + * skb_frag_size_set() - Sets the size of a skb fragment * @frag: skb fragment * @size: size of fragment */ @@ -331,7 +331,7 @@ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) } /** - * skb_frag_size_add - Incrementes the size of a skb fragment by %delta + * skb_frag_size_add() - Increments the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ @@ -341,7 +341,7 @@ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) } /** - * skb_frag_size_sub - Decrements the size of a skb fragment by %delta + * skb_frag_size_sub() - Decrements the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to subtract */ @@ -2857,6 +2857,46 @@ static inline void skb_propagate_pfmemalloc(struct page *page, skb->pfmemalloc = true; } +/** + * skb_frag_off() - Returns the offset of a skb fragment + * @frag: the paged fragment + */ +static inline unsigned int skb_frag_off(const skb_frag_t *frag) +{ + return frag->page_offset; +} + +/** + * skb_frag_off_add() - Increments the offset of a skb fragment by @delta + * @frag: skb fragment + * @delta: value to add + */ +static inline void skb_frag_off_add(skb_frag_t *frag, int delta) +{ + frag->page_offset += delta; +} + +/** + * skb_frag_off_set() - Sets the offset of a skb fragment + * @frag: skb fragment + * @offset: offset of fragment + */ +static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) +{ + frag->page_offset = offset; +} + +/** + * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment + * @fragto: skb fragment where offset is set + * @fragfrom: skb fragment offset is copied from + */ +static inline void skb_frag_off_copy(skb_frag_t *fragto, + const skb_frag_t *fragfrom) +{ + fragto->page_offset = fragfrom->page_offset; +} + /** * skb_frag_page - retrieve the page referred to by a paged fragment * @frag: the paged fragment @@ -2923,7 +2963,7 @@ static inline void skb_frag_unref(struct sk_buff *skb, int f) */ static inline void *skb_frag_address(const skb_frag_t *frag) { - return page_address(skb_frag_page(frag)) + frag->page_offset; + return page_address(skb_frag_page(frag)) + skb_frag_off(frag); } /** @@ -2939,7 +2979,18 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag) if (unlikely(!ptr)) return NULL; - return ptr + frag->page_offset; + return ptr + skb_frag_off(frag); +} + +/** + * skb_frag_page_copy() - sets the page in a fragment from another fragment + * @fragto: skb fragment where page is set + * @fragfrom: skb fragment page is copied from + */ +static inline void skb_frag_page_copy(skb_frag_t *fragto, + const skb_frag_t *fragfrom) +{ + fragto->bv_page = fragfrom->bv_page; } /** @@ -2987,7 +3038,7 @@ static inline dma_addr_t skb_frag_dma_map(struct device *dev, enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), - frag->page_offset + offset, size, dir); + skb_frag_off(frag) + offset, size, dir); } static inline struct sk_buff *pskb_copy(struct sk_buff *skb, @@ -3157,7 +3208,7 @@ static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; return page == skb_frag_page(frag) && - off == frag->page_offset + skb_frag_size(frag); + off == skb_frag_off(frag) + skb_frag_size(frag); } return false; } -- cgit v1.2.3 From 65c84f148e359ed398dcc9ed736131103f40896b Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Tue, 30 Jul 2019 07:40:34 -0700 Subject: linux: Remove bvec page_offset, use bv_offset Now that page_offset is referenced through accessors, remove the union, and use bv_offset. Signed-off-by: Jonathan Lemon Signed-off-by: David S. Miller --- include/linux/bvec.h | 5 +---- include/linux/skbuff.h | 10 +++++----- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 7f2b2ea9399c..a032f01e928c 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -18,10 +18,7 @@ struct bio_vec { struct page *bv_page; unsigned int bv_len; - union { - __u32 page_offset; - unsigned int bv_offset; - }; + unsigned int bv_offset; }; struct bvec_iter { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2957cdd6c032..3aef8d82ea59 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2078,7 +2078,7 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, * on page_is_pfmemalloc doing the right thing(tm). */ frag->bv_page = page; - frag->page_offset = off; + frag->bv_offset = off; skb_frag_size_set(frag, size); page = compound_head(page); @@ -2863,7 +2863,7 @@ static inline void skb_propagate_pfmemalloc(struct page *page, */ static inline unsigned int skb_frag_off(const skb_frag_t *frag) { - return frag->page_offset; + return frag->bv_offset; } /** @@ -2873,7 +2873,7 @@ static inline unsigned int skb_frag_off(const skb_frag_t *frag) */ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) { - frag->page_offset += delta; + frag->bv_offset += delta; } /** @@ -2883,7 +2883,7 @@ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) */ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) { - frag->page_offset = offset; + frag->bv_offset = offset; } /** @@ -2894,7 +2894,7 @@ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) static inline void skb_frag_off_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { - fragto->page_offset = fragfrom->page_offset; + fragto->bv_offset = fragfrom->bv_offset; } /** -- cgit v1.2.3 From 055d88242a6046a1ceac3167290f054c72571cd9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 30 Jul 2019 21:25:20 +0200 Subject: compat_ioctl: pppoe: fix PPPOEIOCSFWD handling Support for handling the PPPOEIOCSFWD ioctl in compat mode was added in linux-2.5.69 along with hundreds of other commands, but was always broken sincen only the structure is compatible, but the command number is not, due to the size being sizeof(size_t), or at first sizeof(sizeof((struct sockaddr_pppox)), which is different on 64-bit architectures. Guillaume Nault adds: And the implementation was broken until 2016 (see 29e73269aa4d ("pppoe: fix reference counting in PPPoE proxy")), and nobody ever noticed. I should probably have removed this ioctl entirely instead of fixing it. Clearly, it has never been used. Fix it by adding a compat_ioctl handler for all pppoe variants that translates the command number and then calls the regular ioctl function. All other ioctl commands handled by pppoe are compatible between 32-bit and 64-bit, and require compat_ptr() conversion. This should apply to all stable kernels. Acked-by: Guillaume Nault Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ppp/pppoe.c | 3 +++ drivers/net/ppp/pppox.c | 13 +++++++++++++ drivers/net/ppp/pptp.c | 3 +++ fs/compat_ioctl.c | 3 --- include/linux/if_pppox.h | 3 +++ net/l2tp/l2tp_ppp.c | 3 +++ 6 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 1d902ecb4aa8..a44dd3c8af63 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1115,6 +1115,9 @@ static const struct proto_ops pppoe_ops = { .recvmsg = pppoe_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pppox_compat_ioctl, +#endif }; static const struct pppox_proto pppoe_proto = { diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index 5ef422a43d70..08364f10a43f 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -98,6 +99,18 @@ int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) EXPORT_SYMBOL(pppox_ioctl); +#ifdef CONFIG_COMPAT +int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + if (cmd == PPPOEIOCSFWD32) + cmd = PPPOEIOCSFWD; + + return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); +} + +EXPORT_SYMBOL(pppox_compat_ioctl); +#endif + static int pppox_create(struct net *net, struct socket *sock, int protocol, int kern) { diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index a8e52c8e4128..734de7de03f7 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -623,6 +623,9 @@ static const struct proto_ops pptp_ops = { .recvmsg = sock_no_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pppox_compat_ioctl, +#endif }; static const struct pppox_proto pppox_pptp_proto = { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6e30949d9f77..a7ec2d3dff92 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -638,9 +638,6 @@ COMPATIBLE_IOCTL(PPPIOCDISCONN) COMPATIBLE_IOCTL(PPPIOCATTCHAN) COMPATIBLE_IOCTL(PPPIOCGCHAN) COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) -/* PPPOX */ -COMPATIBLE_IOCTL(PPPOEIOCSFWD) -COMPATIBLE_IOCTL(PPPOEIOCDFWD) /* Big A */ /* sparc only */ /* Big Q for sound/OSS */ diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 8b728750a625..69e813bcb947 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -80,6 +80,9 @@ extern int register_pppox_proto(int proto_num, const struct pppox_proto *pp); extern void unregister_pppox_proto(int proto_num); extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +extern int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); + +#define PPPOEIOCSFWD32 _IOW(0xB1 ,0, compat_size_t) /* PPPoX socket states */ enum { diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 1d0e5904dedf..c54cb59593ef 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1681,6 +1681,9 @@ static const struct proto_ops pppol2tp_ops = { .recvmsg = pppol2tp_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pppox_compat_ioctl, +#endif }; static const struct pppox_proto pppol2tp_proto = { -- cgit v1.2.3 From b74494872555d1f7888dfd9225700a363f4a84fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 20:30:49 +0200 Subject: hrtimer: Remove task argument from hrtimer_init_sleeper() All callers hand in 'current' and that's the only task pointer which actually makes sense. Remove the task argument and set current in the function. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185752.791885290@linutronix.de --- block/blk-mq.c | 2 +- drivers/staging/android/vsoc.c | 2 +- include/linux/hrtimer.h | 3 +-- include/linux/wait.h | 2 +- kernel/futex.c | 2 +- kernel/time/hrtimer.c | 8 ++++---- net/core/pktgen.c | 2 +- 7 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index b038ec680e84..5f647cb8c695 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3418,7 +3418,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); hrtimer_set_expires(&hs.timer, kt); - hrtimer_init_sleeper(&hs, current); + hrtimer_init_sleeper(&hs); do { if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c index 00a1ec7b9154..ce480bcf20d2 100644 --- a/drivers/staging/android/vsoc.c +++ b/drivers/staging/android/vsoc.c @@ -442,7 +442,7 @@ static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg) hrtimer_set_expires_range_ns(&to->timer, wake_time, current->timer_slack_ns); - hrtimer_init_sleeper(to, current); + hrtimer_init_sleeper(to); } while (1) { diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4971100a8cab..3c74f89367c4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -463,8 +463,7 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); -extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - struct task_struct *tsk); +extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); diff --git a/include/linux/wait.h b/include/linux/wait.h index b6f77cf60dd7..d57832774ca6 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -489,7 +489,7 @@ do { \ struct hrtimer_sleeper __t; \ \ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \ - hrtimer_init_sleeper(&__t, current); \ + hrtimer_init_sleeper(&__t); \ if ((timeout) != KTIME_MAX) \ hrtimer_start_range_ns(&__t.timer, timeout, \ current->timer_slack_ns, \ diff --git a/kernel/futex.c b/kernel/futex.c index 6d50728ef2e7..5e9842ea4012 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -490,7 +490,7 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - hrtimer_init_sleeper(timeout, current); + hrtimer_init_sleeper(timeout); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5ee77f1a8a92..de895d86800c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1639,10 +1639,10 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl) { sl->timer.function = hrtimer_wakeup; - sl->task = task; + sl->task = current; } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1669,7 +1669,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod { struct restart_block *restart; - hrtimer_init_sleeper(t, current); + hrtimer_init_sleeper(t); do { set_current_state(TASK_INTERRUPTIBLE); @@ -1930,7 +1930,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, hrtimer_init_on_stack(&t.timer, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_init_sleeper(&t, current); + hrtimer_init_sleeper(&t); hrtimer_start_expires(&t.timer, mode); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bb9915291644..7f3cf2381f27 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2171,7 +2171,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } while (ktime_compare(end_time, spin_until) < 0); } else { /* see do_nanosleep */ - hrtimer_init_sleeper(&t, current); + hrtimer_init_sleeper(&t); do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); -- cgit v1.2.3 From 473c7391ce731adb482c03e420964676ed8b494d Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 30 Jul 2019 17:43:30 +0200 Subject: vsock/virtio: limit the memory used per-socket Since virtio-vsock was introduced, the buffers filled by the host and pushed to the guest using the vring, are directly queued in a per-socket list. These buffers are preallocated by the guest with a fixed size (4 KB). The maximum amount of memory used by each socket should be controlled by the credit mechanism. The default credit available per-socket is 256 KB, but if we use only 1 byte per packet, the guest can queue up to 262144 of 4 KB buffers, using up to 1 GB of memory per-socket. In addition, the guest will continue to fill the vring with new 4 KB free buffers to avoid starvation of other sockets. This patch mitigates this issue copying the payload of small packets (< 128 bytes) into the buffer of last packet queued, in order to avoid wasting memory. Signed-off-by: Stefano Garzarella Reviewed-by: Stefan Hajnoczi Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/vsock.c | 2 ++ include/linux/virtio_vsock.h | 1 + net/vmw_vsock/virtio_transport.c | 1 + net/vmw_vsock/virtio_transport_common.c | 60 ++++++++++++++++++++++++++++----- 4 files changed, 55 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 6a50e1d0529c..6c8390a2af52 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -329,6 +329,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, return NULL; } + pkt->buf_len = pkt->len; + nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); if (nbytes != pkt->len) { vq_err(vq, "Expected %u byte payload, got %zu bytes\n", diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index e223e2632edd..7d973903f52e 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -52,6 +52,7 @@ struct virtio_vsock_pkt { /* socket refcnt not held, only use for cancellation */ struct vsock_sock *vsk; void *buf; + u32 buf_len; u32 len; u32 off; bool reply; diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 0815d1357861..082a30936690 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -307,6 +307,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) break; } + pkt->buf_len = buf_len; pkt->len = buf_len; sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 6f1a8aff65c5..095221f94786 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -26,6 +26,9 @@ /* How long to wait for graceful shutdown of a connection */ #define VSOCK_CLOSE_TIMEOUT (8 * HZ) +/* Threshold for detecting small packets to copy */ +#define GOOD_COPY_LEN 128 + static const struct virtio_transport *virtio_transport_get_ops(void) { const struct vsock_transport *t = vsock_core_get_transport(); @@ -64,6 +67,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, pkt->buf = kmalloc(len, GFP_KERNEL); if (!pkt->buf) goto out_pkt; + + pkt->buf_len = len; + err = memcpy_from_msg(pkt->buf, info->msg, len); if (err) goto out; @@ -841,24 +847,60 @@ destroy: return err; } +static void +virtio_transport_recv_enqueue(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + bool free_pkt = false; + + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + + spin_lock_bh(&vvs->rx_lock); + + virtio_transport_inc_rx_pkt(vvs, pkt); + + /* Try to copy small packets into the buffer of last packet queued, + * to avoid wasting memory queueing the entire buffer with a small + * payload. + */ + if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) { + struct virtio_vsock_pkt *last_pkt; + + last_pkt = list_last_entry(&vvs->rx_queue, + struct virtio_vsock_pkt, list); + + /* If there is space in the last packet queued, we copy the + * new packet in its buffer. + */ + if (pkt->len <= last_pkt->buf_len - last_pkt->len) { + memcpy(last_pkt->buf + last_pkt->len, pkt->buf, + pkt->len); + last_pkt->len += pkt->len; + free_pkt = true; + goto out; + } + } + + list_add_tail(&pkt->list, &vvs->rx_queue); + +out: + spin_unlock_bh(&vvs->rx_lock); + if (free_pkt) + virtio_transport_free_pkt(pkt); +} + static int virtio_transport_recv_connected(struct sock *sk, struct virtio_vsock_pkt *pkt) { struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_vsock_sock *vvs = vsk->trans; int err = 0; switch (le16_to_cpu(pkt->hdr.op)) { case VIRTIO_VSOCK_OP_RW: - pkt->len = le32_to_cpu(pkt->hdr.len); - pkt->off = 0; - - spin_lock_bh(&vvs->rx_lock); - virtio_transport_inc_rx_pkt(vvs, pkt); - list_add_tail(&pkt->list, &vvs->rx_queue); - spin_unlock_bh(&vvs->rx_lock); - + virtio_transport_recv_enqueue(vsk, pkt); sk->sk_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_UPDATE: -- cgit v1.2.3 From b89d882dc9fc279c8acbf1df71d51b22394186d5 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 30 Jul 2019 17:43:31 +0200 Subject: vsock/virtio: reduce credit update messages In order to reduce the number of credit update messages, we send them only when the space available seen by the transmitter is less than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. Signed-off-by: Stefano Garzarella Reviewed-by: Stefan Hajnoczi Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 1 + net/vmw_vsock/virtio_transport_common.c | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 7d973903f52e..49fc9d20bc43 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -41,6 +41,7 @@ struct virtio_vsock_sock { /* Protected by rx_lock */ u32 fwd_cnt; + u32 last_fwd_cnt; u32 rx_bytes; struct list_head rx_queue; }; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 095221f94786..a85559d4d974 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -211,6 +211,7 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) { spin_lock_bh(&vvs->tx_lock); + vvs->last_fwd_cnt = vvs->fwd_cnt; pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); spin_unlock_bh(&vvs->tx_lock); @@ -261,6 +262,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct virtio_vsock_sock *vvs = vsk->trans; struct virtio_vsock_pkt *pkt; size_t bytes, total = 0; + u32 free_space; int err = -EFAULT; spin_lock_bh(&vvs->rx_lock); @@ -291,11 +293,19 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, virtio_transport_free_pkt(pkt); } } + + free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt); + spin_unlock_bh(&vvs->rx_lock); - /* Send a credit pkt to peer */ - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, - NULL); + /* We send a credit update only when the space available seen + * by the transmitter is less than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + */ + if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { + virtio_transport_send_credit_update(vsk, + VIRTIO_VSOCK_TYPE_STREAM, + NULL); + } return total; -- cgit v1.2.3 From 9632e9f61bc4191411c47933abe5f2d93c578f5e Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 30 Jul 2019 17:43:32 +0200 Subject: vsock/virtio: fix locking in virtio_transport_inc_tx_pkt() fwd_cnt and last_fwd_cnt are protected by rx_lock, so we should use the same spinlock also if we are in the TX path. Move also buf_alloc under the same lock. Signed-off-by: Stefano Garzarella Reviewed-by: Stefan Hajnoczi Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 2 +- net/vmw_vsock/virtio_transport_common.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 49fc9d20bc43..4c7781f4b29b 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -35,7 +35,6 @@ struct virtio_vsock_sock { /* Protected by tx_lock */ u32 tx_cnt; - u32 buf_alloc; u32 peer_fwd_cnt; u32 peer_buf_alloc; @@ -43,6 +42,7 @@ struct virtio_vsock_sock { u32 fwd_cnt; u32 last_fwd_cnt; u32 rx_bytes; + u32 buf_alloc; struct list_head rx_queue; }; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a85559d4d974..34a2b42313b7 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -210,11 +210,11 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) { - spin_lock_bh(&vvs->tx_lock); + spin_lock_bh(&vvs->rx_lock); vvs->last_fwd_cnt = vvs->fwd_cnt; pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); - spin_unlock_bh(&vvs->tx_lock); + spin_unlock_bh(&vvs->rx_lock); } EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); -- cgit v1.2.3 From 016e43a26bab0126e33c9682f9d9d05eca9f0386 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Mon, 29 Jul 2019 19:49:46 +0200 Subject: net: dsa: ksz: Add KSZ8795 tag code Add DSA tag code for Microchip KSZ8795 switch. The switch is simpler and the tag is only 1 byte, instead of 2 as is the case with KSZ9477. Signed-off-by: Tristram Ha Signed-off-by: Marek Vasut Cc: Andrew Lunn Cc: David S. Miller Cc: Florian Fainelli Cc: Tristram Ha Cc: Vivien Didelot Cc: Woojung Huh Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ net/dsa/tag_ksz.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1e8650fa8acc..147b757ef8ea 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -41,6 +41,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 +#define DSA_TAG_PROTO_KSZ8795_VALUE 14 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -57,6 +58,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, + DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, }; struct packet_type; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index b4872b87d4a6..73605bcbb385 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -69,6 +69,67 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, return skb; } +/* + * For Ingress (Host -> KSZ8795), 1 byte is added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) + * + * For Egress (KSZ8795 -> Host), 1 byte is added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : zero-based value represents port + * (eg, 0x00=port1, 0x02=port3, 0x06=port7) + */ + +#define KSZ8795_INGRESS_TAG_LEN 1 + +#define KSZ8795_TAIL_TAG_OVERRIDE BIT(6) +#define KSZ8795_TAIL_TAG_LOOKUP BIT(7) + +static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct sk_buff *nskb; + u8 *tag; + u8 *addr; + + nskb = ksz_common_xmit(skb, dev, KSZ8795_INGRESS_TAG_LEN); + if (!nskb) + return NULL; + + /* Tag encoding */ + tag = skb_put(nskb, KSZ8795_INGRESS_TAG_LEN); + addr = skb_mac_header(nskb); + + *tag = 1 << dp->index; + if (is_link_local_ether_addr(addr)) + *tag |= KSZ8795_TAIL_TAG_OVERRIDE; + + return nskb; +} + +static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + + return ksz_common_rcv(skb, dev, tag[0] & 7, KSZ_EGRESS_TAG_LEN); +} + +static const struct dsa_device_ops ksz8795_netdev_ops = { + .name = "ksz8795", + .proto = DSA_TAG_PROTO_KSZ8795, + .xmit = ksz8795_xmit, + .rcv = ksz8795_rcv, + .overhead = KSZ8795_INGRESS_TAG_LEN, +}; + +DSA_TAG_DRIVER(ksz8795_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795); + /* * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS. * --------------------------------------------------------------------------- @@ -183,6 +244,7 @@ DSA_TAG_DRIVER(ksz9893_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893); static struct dsa_tag_driver *dsa_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops), &DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops), &DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops), }; -- cgit v1.2.3 From fce04b1ce8e326bf2cafaea4a0c91dbc3e99e403 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Jul 2019 13:43:35 +0300 Subject: gpiolib: of: Reshuffle contents of consumer.h for new library layout Kernel build bot reported a compilation error after the commit f626d6dfb709 ("gpio: of: Break out OF-only code"): drivers/gpio/gpiolib-devres.o: In function `devm_gpiod_get_from_of_node': gpiolib-devres.c:(.text+0x19a): undefined reference to `gpiod_get_from_of_node' This happens due to move the latter under umbrella of CONFIG_OF_GPIO while customer.h contains staled data. Fix it by reshuffling contents of consumer.h to satisfy build dependencies. Reported-by: kbuild test robot Fixes: f626d6dfb709 ("gpio: of: Break out OF-only code"): Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20190730104337.21235-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- include/linux/gpio/consumer.h | 78 ++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 9ddcf50a3c59..6019d4d128c5 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -170,18 +170,8 @@ struct gpio_desc *gpio_to_desc(unsigned gpio); int desc_to_gpio(const struct gpio_desc *desc); /* Child properties interface */ -struct device_node; struct fwnode_handle; -struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label); -struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, - struct device_node *node, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label); struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, const char *propname, int index, enum gpiod_flags dflags, @@ -530,28 +520,8 @@ static inline int desc_to_gpio(const struct gpio_desc *desc) } /* Child properties interface */ -struct device_node; struct fwnode_handle; -static inline -struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label) -{ - return ERR_PTR(-ENOSYS); -} - -static inline -struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, - struct device_node *node, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label) -{ - return ERR_PTR(-ENOSYS); -} - static inline struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, const char *propname, int index, @@ -584,6 +554,54 @@ struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev, flags, label); } +#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_OF_GPIO) +struct device_node; + +struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, + const char *propname, int index, + enum gpiod_flags dflags, + const char *label); + +#else /* CONFIG_GPIOLIB && CONFIG_OF_GPIO */ + +struct device_node; + +static inline +struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, + const char *propname, int index, + enum gpiod_flags dflags, + const char *label) +{ + return ERR_PTR(-ENOSYS); +} + +#endif /* CONFIG_GPIOLIB && CONFIG_OF_GPIO */ + +#ifdef CONFIG_GPIOLIB +struct device_node; + +struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, + struct device_node *node, + const char *propname, int index, + enum gpiod_flags dflags, + const char *label); + +#else /* CONFIG_GPIOLIB */ + +struct device_node; + +static inline +struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, + struct device_node *node, + const char *propname, int index, + enum gpiod_flags dflags, + const char *label) +{ + return ERR_PTR(-ENOSYS); +} + +#endif /* CONFIG_GPIOLIB */ + #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS) int gpiod_export(struct gpio_desc *desc, bool direction_may_change); -- cgit v1.2.3 From 2838bf941b120ec846a3903db13e319368d51b08 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Jul 2019 13:43:37 +0300 Subject: gpiolib-acpi: Move acpi_dev_add_driver_gpios() et al to consumer.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The API, which belongs to GPIO library, is foreign to ACPI headers. Earlier we moved out I²C out of the latter, and now it's time for acpi_dev_add_driver_gpios() et al. For time being the acpi_gpio_get_irq_resource() and acpi_dev_gpio_irq_get() are left untouched as they need more thought about. Note, it requires uninline acpi_dev_remove_driver_gpios() to keep purity of consumer.h. Cc: Pierre-Louis Bossart Cc: Liam Girdwood Cc: Jie Yang Cc: Mark Brown Cc: alsa-devel@alsa-project.org (moderated list:INTEL ASoC DRIVERS) Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20190730104337.21235-3-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-acpi.c | 7 ++++ include/linux/acpi.h | 51 ------------------------- include/linux/gpio/consumer.h | 57 ++++++++++++++++++++++++++++ sound/soc/intel/boards/bytcht_cx2072x.c | 1 + sound/soc/intel/boards/cht_bsw_max98090_ti.c | 1 + sound/soc/intel/boards/cht_bsw_rt5672.c | 1 + 6 files changed, 67 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index d54fc6e7c8a9..fdee8afa5339 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -383,6 +383,13 @@ int acpi_dev_add_driver_gpios(struct acpi_device *adev, } EXPORT_SYMBOL_GPL(acpi_dev_add_driver_gpios); +void acpi_dev_remove_driver_gpios(struct acpi_device *adev) +{ + if (adev) + adev->driver_gpios = NULL; +} +EXPORT_SYMBOL_GPL(acpi_dev_remove_driver_gpios); + static void devm_acpi_dev_release_driver_gpios(struct device *dev, void *res) { acpi_dev_remove_driver_gpios(ACPI_COMPANION(dev)); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9426b9aaed86..e40e1e27ed8e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -994,62 +994,11 @@ void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const c #endif #endif -struct acpi_gpio_params { - unsigned int crs_entry_index; - unsigned int line_index; - bool active_low; -}; - -struct acpi_gpio_mapping { - const char *name; - const struct acpi_gpio_params *data; - unsigned int size; - -/* Ignore IoRestriction field */ -#define ACPI_GPIO_QUIRK_NO_IO_RESTRICTION BIT(0) -/* - * When ACPI GPIO mapping table is in use the index parameter inside it - * refers to the GPIO resource in _CRS method. That index has no - * distinction of actual type of the resource. When consumer wants to - * get GpioIo type explicitly, this quirk may be used. - */ -#define ACPI_GPIO_QUIRK_ONLY_GPIOIO BIT(1) - - unsigned int quirks; -}; - #if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB) -int acpi_dev_add_driver_gpios(struct acpi_device *adev, - const struct acpi_gpio_mapping *gpios); - -static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) -{ - if (adev) - adev->driver_gpios = NULL; -} - -int devm_acpi_dev_add_driver_gpios(struct device *dev, - const struct acpi_gpio_mapping *gpios); -void devm_acpi_dev_remove_driver_gpios(struct device *dev); - bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index); #else -static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, - const struct acpi_gpio_mapping *gpios) -{ - return -ENXIO; -} -static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) {} - -static inline int devm_acpi_dev_add_driver_gpios(struct device *dev, - const struct acpi_gpio_mapping *gpios) -{ - return -ENXIO; -} -static inline void devm_acpi_dev_remove_driver_gpios(struct device *dev) {} - static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 6019d4d128c5..aac617371b60 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -602,6 +602,63 @@ struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, #endif /* CONFIG_GPIOLIB */ +struct acpi_gpio_params { + unsigned int crs_entry_index; + unsigned int line_index; + bool active_low; +}; + +struct acpi_gpio_mapping { + const char *name; + const struct acpi_gpio_params *data; + unsigned int size; + +/* Ignore IoRestriction field */ +#define ACPI_GPIO_QUIRK_NO_IO_RESTRICTION BIT(0) +/* + * When ACPI GPIO mapping table is in use the index parameter inside it + * refers to the GPIO resource in _CRS method. That index has no + * distinction of actual type of the resource. When consumer wants to + * get GpioIo type explicitly, this quirk may be used. + */ +#define ACPI_GPIO_QUIRK_ONLY_GPIOIO BIT(1) + + unsigned int quirks; +}; + +#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_ACPI) + +struct acpi_device; + +int acpi_dev_add_driver_gpios(struct acpi_device *adev, + const struct acpi_gpio_mapping *gpios); +void acpi_dev_remove_driver_gpios(struct acpi_device *adev); + +int devm_acpi_dev_add_driver_gpios(struct device *dev, + const struct acpi_gpio_mapping *gpios); +void devm_acpi_dev_remove_driver_gpios(struct device *dev); + +#else /* CONFIG_GPIOLIB && CONFIG_ACPI */ + +struct acpi_device; + +static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, + const struct acpi_gpio_mapping *gpios) +{ + return -ENXIO; +} +static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) {} + +static inline int devm_acpi_dev_add_driver_gpios(struct device *dev, + const struct acpi_gpio_mapping *gpios) +{ + return -ENXIO; +} +static inline void devm_acpi_dev_remove_driver_gpios(struct device *dev) {} + +#endif /* CONFIG_GPIOLIB && CONFIG_ACPI */ + + #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS) int gpiod_export(struct gpio_desc *desc, bool direction_may_change); diff --git a/sound/soc/intel/boards/bytcht_cx2072x.c b/sound/soc/intel/boards/bytcht_cx2072x.c index 54ac2fd41925..67f06c95eec5 100644 --- a/sound/soc/intel/boards/bytcht_cx2072x.c +++ b/sound/soc/intel/boards/bytcht_cx2072x.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/sound/soc/intel/boards/cht_bsw_max98090_ti.c b/sound/soc/intel/boards/cht_bsw_max98090_ti.c index 33eb72545be6..05db311b579e 100644 --- a/sound/soc/intel/boards/cht_bsw_max98090_ti.c +++ b/sound/soc/intel/boards/cht_bsw_max98090_ti.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include diff --git a/sound/soc/intel/boards/cht_bsw_rt5672.c b/sound/soc/intel/boards/cht_bsw_rt5672.c index 4977b5a65eb8..9d657421730a 100644 --- a/sound/soc/intel/boards/cht_bsw_rt5672.c +++ b/sound/soc/intel/boards/cht_bsw_rt5672.c @@ -8,6 +8,7 @@ * Mengdong Lin */ +#include #include #include #include -- cgit v1.2.3 From b1d45c23284e55a379f85554a27a548b7988d47a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 20 Jul 2019 19:39:43 +0900 Subject: tracing: Fix header include guards in trace event headers These include guards are broken. Match the #if !define() and #define lines so that they work correctly. Link: http://lkml.kernel.org/r/20190720103943.16982-1-yamada.masahiro@socionext.com Fixes: f54d1867005c3 ("dma-buf: Rename struct fence to dma_fence") Fixes: 2e26ca7150a4f ("tracing: Fix tracepoint.h DECLARE_TRACE() to allow more than one header") Fixes: e543002f77f46 ("qdisc: add tracepoint qdisc:qdisc_dequeue for dequeued SKBs") Fixes: 95f295f9fe081 ("dmaengine: tegra: add tracepoints to driver") Signed-off-by: Masahiro Yamada Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/dma_fence.h | 2 +- include/trace/events/napi.h | 4 ++-- include/trace/events/qdisc.h | 4 ++-- include/trace/events/tegra_apb_dma.h | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/dma_fence.h b/include/trace/events/dma_fence.h index 2212adda8f77..64e92d56c6a8 100644 --- a/include/trace/events/dma_fence.h +++ b/include/trace/events/dma_fence.h @@ -2,7 +2,7 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM dma_fence -#if !defined(_TRACE_FENCE_H) || defined(TRACE_HEADER_MULTI_READ) +#if !defined(_TRACE_DMA_FENCE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DMA_FENCE_H #include diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h index f3a12566bed0..6678cf8b235b 100644 --- a/include/trace/events/napi.h +++ b/include/trace/events/napi.h @@ -3,7 +3,7 @@ #define TRACE_SYSTEM napi #if !defined(_TRACE_NAPI_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_NAPI_H_ +#define _TRACE_NAPI_H #include #include @@ -38,7 +38,7 @@ TRACE_EVENT(napi_poll, #undef NO_DEV -#endif /* _TRACE_NAPI_H_ */ +#endif /* _TRACE_NAPI_H */ /* This part must be outside protection */ #include diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index 60d0d8bd336d..0d1a9ebf55ba 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -2,7 +2,7 @@ #define TRACE_SYSTEM qdisc #if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_QDISC_H_ +#define _TRACE_QDISC_H #include #include @@ -44,7 +44,7 @@ TRACE_EVENT(qdisc_dequeue, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); -#endif /* _TRACE_QDISC_H_ */ +#endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ #include diff --git a/include/trace/events/tegra_apb_dma.h b/include/trace/events/tegra_apb_dma.h index 0818f6286110..971cd02d2daf 100644 --- a/include/trace/events/tegra_apb_dma.h +++ b/include/trace/events/tegra_apb_dma.h @@ -1,5 +1,5 @@ #if !defined(_TRACE_TEGRA_APB_DMA_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_TEGRA_APM_DMA_H +#define _TRACE_TEGRA_APB_DMA_H #include #include @@ -55,7 +55,7 @@ TRACE_EVENT(tegra_dma_isr, TP_printk("%s: irq %d\n", __get_str(chan), __entry->irq) ); -#endif /* _TRACE_TEGRADMA_H */ +#endif /* _TRACE_TEGRA_APB_DMA_H */ /* This part must be outside protection */ #include -- cgit v1.2.3 From 9349d600fb6a1ca0aaeb515523e1bb5409483d76 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:14 -0700 Subject: tcp: add skb-less helpers to retrieve SYN cookie This patch allows generation of a SYN cookie before an SKB has been allocated, as is the case at XDP. Signed-off-by: Petar Penkov Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 10 +++++++ net/ipv4/tcp_input.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 15 +++++++++++ net/ipv6/tcp_ipv6.c | 15 +++++++++++ 4 files changed, 113 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e5cf514ba118..fb7e153aecc5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -414,6 +414,16 @@ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); +/* + * BPF SKB-less helpers + */ +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie); +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie); +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8892df6de1d4..706cbb3b2986 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3782,6 +3782,49 @@ static void smc_parse_options(const struct tcphdr *th, #endif } +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped + * value on success. + */ +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +{ + const unsigned char *ptr = (const unsigned char *)(th + 1); + int length = (th->doff * 4) - sizeof(struct tcphdr); + u16 mss = 0; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return mss; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + if (length < 2) + return mss; + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return mss; + if (opsize > length) + return mss; /* fail on partial options */ + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { + u16 in_mss = get_unaligned_be16(ptr); + + if (in_mss) { + if (user_mss && user_mss < in_mss) + in_mss = user_mss; + mss = in_mss; + } + } + ptr += opsize - 2; + length -= opsize; + } + } + return mss; +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -6464,6 +6507,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk, } } +/* If a SYN cookie is required and supported, returns a clamped MSS value to be + * used for SYN cookie generation. + */ +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 mss; + + if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 && + !inet_csk_reqsk_queue_is_full(sk)) + return 0; + + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) + return 0; + + if (sk_acceptq_is_full(sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + return 0; + } + + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + if (!mss) + mss = af_ops->mss_clamp; + + return mss; +} +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d57641cb3477..10217393cda6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1515,6 +1515,21 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, th); + if (mss) { + *cookie = __cookie_v4_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5da069e91cac..87f44d3250ee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1063,6 +1063,21 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, th); + if (mss) { + *cookie = __cookie_v6_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) -- cgit v1.2.3 From 70d66244317e958092e9c971b08dd5b7fd29d9cb Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:15 -0700 Subject: bpf: add bpf_tcp_gen_syncookie helper This helper function allows BPF programs to try to generate SYN cookies, given a reference to a listener socket. The function works from XDP and with an skb context since bpf_skc_lookup_tcp can lookup a socket in both cases. Signed-off-by: Petar Penkov Suggested-by: Eric Dumazet Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 30 +++++++++++++++++++- net/core/filter.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6bbef0c7f585..4393bd4b2419 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2714,6 +2714,33 @@ union bpf_attr { * **-EPERM** if no permission to send the *sig*. * * **-EAGAIN** if bpf program can try again. + * + * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ip6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header. + * + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2825,7 +2852,8 @@ union bpf_attr { FN(strtoul), \ FN(sk_storage_get), \ FN(sk_storage_delete), \ - FN(send_signal), + FN(send_signal), \ + FN(tcp_gen_syncookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 1eee70f80fba..5a2707918629 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5855,6 +5855,75 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + return -ENOENT; + + if (!th->syn || th->ack || th->fin || th->rst) + return -EINVAL; + + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && sk->sk_ipv6only) + return -EINVAL; + + mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case 6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + if (sk->sk_family != AF_INET6) + return -EINVAL; + + mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + if (mss <= 0) + return -ENOENT; + + return cookie | ((u64)mss << 32); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYN_COOKIES */ +} + +static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { + .func = bpf_tcp_gen_syncookie, + .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -6144,6 +6213,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); @@ -6183,6 +6254,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); -- cgit v1.2.3 From a78d14a31666c636a9e00a589032119fb59e3b94 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jul 2019 09:46:29 +0200 Subject: xen: avoid link error on ARM Building the privcmd code as a loadable module on ARM, we get a link error due to the private cache management functions: ERROR: "__sync_icache_dcache" [drivers/xen/xen-privcmd.ko] undefined! Move the code into a new that is always built in when Xen is enabled, as suggested by Juergen Gross and Boris Ostrovsky. Signed-off-by: Arnd Bergmann Reviewed-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/privcmd.c | 35 +++++------------------------------ drivers/xen/xlate_mmu.c | 32 ++++++++++++++++++++++++++++++++ include/xen/xen-ops.h | 3 +++ 3 files changed, 40 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 2f5ce7230a43..c6070e70dd73 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -724,25 +724,6 @@ static long privcmd_ioctl_restrict(struct file *file, void __user *udata) return 0; } -struct remap_pfn { - struct mm_struct *mm; - struct page **pages; - pgprot_t prot; - unsigned long i; -}; - -static int remap_pfn_fn(pte_t *ptep, unsigned long addr, void *data) -{ - struct remap_pfn *r = data; - struct page *page = r->pages[r->i]; - pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), r->prot)); - - set_pte_at(r->mm, addr, ptep, pte); - r->i++; - - return 0; -} - static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) { struct privcmd_data *data = file->private_data; @@ -774,7 +755,8 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) goto out; } - if (xen_feature(XENFEAT_auto_translated_physmap)) { + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && + xen_feature(XENFEAT_auto_translated_physmap)) { unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE); struct page **pages; unsigned int i; @@ -808,16 +790,9 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) if (rc) goto out; - if (xen_feature(XENFEAT_auto_translated_physmap)) { - struct remap_pfn r = { - .mm = vma->vm_mm, - .pages = vma->vm_private_data, - .prot = vma->vm_page_prot, - }; - - rc = apply_to_page_range(r.mm, kdata.addr, - kdata.num << PAGE_SHIFT, - remap_pfn_fn, &r); + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && + xen_feature(XENFEAT_auto_translated_physmap)) { + rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT); } else { unsigned int domid = (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c index ba883a80b3c0..7b1077f0abcb 100644 --- a/drivers/xen/xlate_mmu.c +++ b/drivers/xen/xlate_mmu.c @@ -262,3 +262,35 @@ int __init xen_xlate_map_ballooned_pages(xen_pfn_t **gfns, void **virt, return 0; } EXPORT_SYMBOL_GPL(xen_xlate_map_ballooned_pages); + +struct remap_pfn { + struct mm_struct *mm; + struct page **pages; + pgprot_t prot; + unsigned long i; +}; + +static int remap_pfn_fn(pte_t *ptep, unsigned long addr, void *data) +{ + struct remap_pfn *r = data; + struct page *page = r->pages[r->i]; + pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), r->prot)); + + set_pte_at(r->mm, addr, ptep, pte); + r->i++; + + return 0; +} + +/* Used by the privcmd module, but has to be built-in on ARM */ +int xen_remap_vma_range(struct vm_area_struct *vma, unsigned long addr, unsigned long len) +{ + struct remap_pfn r = { + .mm = vma->vm_mm, + .pages = vma->vm_private_data, + .prot = vma->vm_page_prot, + }; + + return apply_to_page_range(vma->vm_mm, addr, len, remap_pfn_fn, &r); +} +EXPORT_SYMBOL_GPL(xen_remap_vma_range); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 4969817124a8..98b30c1613b2 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -109,6 +109,9 @@ static inline int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, } #endif +int xen_remap_vma_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long len); + /* * xen_remap_domain_gfn_array() - map an array of foreign frames by gfn * @vma: VMA to map the pages into -- cgit v1.2.3 From 796e90f42b7e52cf1c88e978e1d5ee69c102d85d Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 30 Jul 2019 18:37:00 +0200 Subject: cfg80211: add support for parsing OBBS_PD attributes Add the data structure, policy and parsing code allowing userland to send the OBSS PD information into the kernel. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190730163701.18836-2-john@phrozen.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 +++++++++++++++ include/uapi/linux/nl80211.h | 27 ++++++++++++++++++++++++++ net/wireless/nl80211.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 45850a8391d9..35ec1f0a2bf9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -246,6 +246,19 @@ struct ieee80211_rate { u16 hw_value, hw_value_short; }; +/** + * struct ieee80211_he_obss_pd - AP settings for spatial reuse + * + * @enable: is the feature enabled. + * @min_offset: minimal tx power offset an associated station shall use + * @max_offset: maximum tx power offset an associated station shall use + */ +struct ieee80211_he_obss_pd { + bool enable; + u8 min_offset; + u8 max_offset; +}; + /** * struct ieee80211_sta_ht_cap - STA's HT capabilities * @@ -896,6 +909,7 @@ enum cfg80211_ap_settings_flags { * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time * @flags: flags, as defined in enum cfg80211_ap_settings_flags + * @he_obss_pd: OBSS Packet Detection settings */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -923,6 +937,7 @@ struct cfg80211_ap_settings { bool ht_required, vht_required; bool twt_responder; u32 flags; + struct ieee80211_he_obss_pd he_obss_pd; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c45587c2cf44..822851d369ab 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2358,6 +2358,9 @@ enum nl80211_commands { * * @NL80211_ATTR_TWT_RESPONDER: Enable target wait time responder support. * + * @NL80211_ATTR_HE_OBSS_PD: nested attribute for OBSS Packet Detection + * functionality. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2815,6 +2818,8 @@ enum nl80211_attrs { NL80211_ATTR_TWT_RESPONDER, + NL80211_ATTR_HE_OBSS_PD, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -6490,4 +6495,26 @@ enum nl80211_peer_measurement_ftm_resp { NL80211_PMSR_FTM_RESP_ATTR_MAX = NUM_NL80211_PMSR_FTM_RESP_ATTR - 1 }; +/** + * enum nl80211_obss_pd_attributes - OBSS packet detection attributes + * @__NL80211_HE_OBSS_PD_ATTR_INVALID: Invalid + * + * @NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET: the OBSS PD minimum tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET: the OBSS PD maximum tx power offset. + * + * @__NL80211_HE_OBSS_PD_ATTR_LAST: Internal + * @NL80211_HE_OBSS_PD_ATTR_MAX: highest OBSS PD attribute. + */ +enum nl80211_obss_pd_attributes { + __NL80211_HE_OBSS_PD_ATTR_INVALID, + + NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET, + NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET, + + /* keep last */ + __NL80211_HE_OBSS_PD_ATTR_LAST, + NL80211_HE_OBSS_PD_ATTR_MAX = __NL80211_HE_OBSS_PD_ATTR_LAST - 1, +}; + + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1e78ed45a759..3006cfce7158 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -281,6 +281,14 @@ nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = { NLA_POLICY_NESTED_ARRAY(nl80211_psmr_peer_attr_policy), }; +static const struct nla_policy +he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = { + [NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] = + NLA_POLICY_RANGE(NLA_U8, 1, 20), + [NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET] = + NLA_POLICY_RANGE(NLA_U8, 1, 20), +}; + const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, @@ -574,6 +582,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY, .len = SAE_PASSWORD_MAX_LEN }, [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, + [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), }; /* policy for the key attributes */ @@ -4405,6 +4414,34 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, return 0; } +static int nl80211_parse_he_obss_pd(struct nlattr *attrs, + struct ieee80211_he_obss_pd *he_obss_pd) +{ + struct nlattr *tb[NL80211_HE_OBSS_PD_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NL80211_HE_OBSS_PD_ATTR_MAX, attrs, + he_obss_pd_policy, NULL); + if (err) + return err; + + if (!tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] || + !tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]) + return -EINVAL; + + he_obss_pd->min_offset = + nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]); + he_obss_pd->max_offset = + nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]); + + if (he_obss_pd->min_offset >= he_obss_pd->max_offset) + return -EINVAL; + + he_obss_pd->enable = true; + + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -4689,6 +4726,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params.twt_responder = nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); + if (info->attrs[NL80211_ATTR_HE_OBSS_PD]) { + err = nl80211_parse_he_obss_pd( + info->attrs[NL80211_ATTR_HE_OBSS_PD], + ¶ms.he_obss_pd); + if (err) + return err; + } + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) -- cgit v1.2.3 From 1ced169cc1c2f3e054fa14974443383ee02a8b6a Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 30 Jul 2019 18:37:01 +0200 Subject: mac80211: allow setting spatial reuse parameters from bss_conf Store the OBSS PD parameters inside bss_conf when bringing up an AP and/or when a station connects to an AP. This allows the driver to configure the HW accordingly. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190730163701.18836-3-john@phrozen.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/cfg.c | 5 ++++- net/mac80211/he.c | 24 ++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 3 +++ net/mac80211/mlme.c | 1 + 5 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index bd91388797fc..6781d4637557 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -315,6 +315,7 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_FTM_RESPONDER: fime timing reasurement request responder * functionality changed for this BSS (AP mode). * @BSS_CHANGED_TWT: TWT status changed + * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed. * */ enum ieee80211_bss_change { @@ -346,6 +347,7 @@ enum ieee80211_bss_change { BSS_CHANGED_MCAST_RATE = 1<<25, BSS_CHANGED_FTM_RESPONDER = 1<<26, BSS_CHANGED_TWT = 1<<27, + BSS_CHANGED_HE_OBSS_PD = 1<<28, /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -601,6 +603,7 @@ struct ieee80211_ftm_responder_params { * @profile_periodicity: the least number of beacon frames need to be received * in order to discover all the nontransmitted BSSIDs in the set. * @he_operation: HE operation information of the AP we are connected to + * @he_obss_pd: OBSS Packet Detection parameters. */ struct ieee80211_bss_conf { const u8 *bssid; @@ -663,6 +666,7 @@ struct ieee80211_bss_conf { bool ema_ap; u8 profile_periodicity; struct ieee80211_he_operation he_operation; + struct ieee80211_he_obss_pd he_obss_pd; }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 10b99b263c4f..ed56b0c6fe19 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -980,7 +980,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS | BSS_CHANGED_TXPOWER | - BSS_CHANGED_TWT; + BSS_CHANGED_TWT | + BSS_CHANGED_HE_OBSS_PD; int err; int prev_beacon_int; @@ -1051,6 +1052,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; sdata->vif.bss_conf.twt_responder = params->twt_responder; + memcpy(&sdata->vif.bss_conf.he_obss_pd, ¶ms->he_obss_pd, + sizeof(struct ieee80211_he_obss_pd)); sdata->vif.bss_conf.ssid_len = params->ssid_len; if (params->ssid_len) diff --git a/net/mac80211/he.c b/net/mac80211/he.c index f910f730ad0d..a02abfc424aa 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -65,3 +65,27 @@ ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, vif->bss_conf.he_operation = *he_op_ie_elem; } + +void +ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, + const struct ieee80211_he_spr *he_spr_ie_elem) +{ + struct ieee80211_he_obss_pd *he_obss_pd = + &vif->bss_conf.he_obss_pd; + const u8 *data = he_spr_ie_elem->optional; + + memset(he_obss_pd, 0, sizeof(*he_obss_pd)); + + if (!he_spr_ie_elem) + return; + + if (he_spr_ie_elem->he_sr_control & + IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) + data++; + if (he_spr_ie_elem->he_sr_control & + IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) { + he_obss_pd->max_offset = *data++; + he_obss_pd->min_offset = *data++; + he_obss_pd->enable = true; + } +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 38769f5c3da4..791ce58d0f09 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1873,6 +1873,9 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, struct sta_info *sta); +void +ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, + const struct ieee80211_he_spr *he_spr_ie_elem); void ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2b8a7428973d..225633d9e2d4 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3382,6 +3382,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->uora_ocw_range = elems.uora_element[0]; ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems.he_operation); + ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems.he_spr); /* TODO: OPEN: what happens if BSS color disable is set? */ } -- cgit v1.2.3 From 7dcddef6f769d7e60691c732eb6d09cdb1d9df76 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 31 Jul 2019 15:29:52 +1000 Subject: cpuidle: header file stubs must be "static inline" An x86_64 allmodconfig build produces these errors: x86_64-linux-gnu-ld: kernel/sched/core.o: in function `cpuidle_poll_time': core.c:(.text+0x230): multiple definition of `cpuidle_poll_time'; arch/x86/= kernel/process.o:process.c:(.text+0xc0): first defined here (and more) Fixes: 259231a04561 ("cpuidle: add poll_limit_ns to cpuidle_device structure") Signed-off-by: Stephen Rothwell Signed-off-by: Rafael J. Wysocki --- include/linux/cpuidle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index ba535a1a47d5..1a9f54eb3aa1 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -170,7 +170,7 @@ static inline int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) {return -ENODEV; } static inline void cpuidle_reflect(struct cpuidle_device *dev, int index) { } -extern u64 cpuidle_poll_time(struct cpuidle_driver *drv, +static inline u64 cpuidle_poll_time(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return 0; } static inline int cpuidle_register_driver(struct cpuidle_driver *drv) -- cgit v1.2.3 From 82cb54856874b1b374f18420be013ff4057700a9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 29 Jul 2019 12:55:21 +0300 Subject: asm-generic: make simd.h a mandatory include/asm header The generic aegis128 software crypto driver recently gained support for using SIMD intrinsics to increase performance, for which it uncondionally #include's the header. Unfortunately, this header does not exist on many architectures, resulting in build failures. Since asm-generic already has a version of simd.h, let's make it a mandatory header so that it gets instantiated on all architectures that don't provide their own version. Signed-off-by: Ard Biesheuvel Acked-by: Arnd Bergmann Signed-off-by: Herbert Xu --- include/asm-generic/Kbuild | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 6f4536d70b8e..adff14fcb8e4 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -3,3 +3,5 @@ # asm headers that all architectures except um should have # (This file is not included when SRCARCH=um since UML borrows several # asm headers from the host architecutre.) + +mandatory-y += simd.h -- cgit v1.2.3 From 63643b5902c4bf096b504b0563f5426ba5baef15 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 29 Jul 2019 10:51:47 -0500 Subject: ASoC: Intel: Skylake: move NHLT header to common directory Prepare move from NHLT code to common directory, starting with header. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Takashi Iwai --- include/sound/intel-nhlt.h | 119 +++++++++++++++++++++++++++++++++ sound/soc/intel/skylake/skl-nhlt.c | 1 + sound/soc/intel/skylake/skl-nhlt.h | 119 --------------------------------- sound/soc/intel/skylake/skl-ssp-clk.c | 1 + sound/soc/intel/skylake/skl-topology.c | 1 + sound/soc/intel/skylake/skl.h | 1 - 6 files changed, 122 insertions(+), 120 deletions(-) create mode 100644 include/sound/intel-nhlt.h delete mode 100644 sound/soc/intel/skylake/skl-nhlt.h (limited to 'include') diff --git a/include/sound/intel-nhlt.h b/include/sound/intel-nhlt.h new file mode 100644 index 000000000000..f85fbf9c7ce4 --- /dev/null +++ b/include/sound/intel-nhlt.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * skl-nhlt.h - Intel HDA Platform NHLT header + * + * Copyright (C) 2015 Intel Corp + * Author: Sanjiv Kumar + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#ifndef __SKL_NHLT_H__ +#define __SKL_NHLT_H__ + +#include + +struct wav_fmt { + u16 fmt_tag; + u16 channels; + u32 samples_per_sec; + u32 avg_bytes_per_sec; + u16 block_align; + u16 bits_per_sample; + u16 cb_size; +} __packed; + +struct wav_fmt_ext { + struct wav_fmt fmt; + union samples { + u16 valid_bits_per_sample; + u16 samples_per_block; + u16 reserved; + } sample; + u32 channel_mask; + u8 sub_fmt[16]; +} __packed; + +enum nhlt_link_type { + NHLT_LINK_HDA = 0, + NHLT_LINK_DSP = 1, + NHLT_LINK_DMIC = 2, + NHLT_LINK_SSP = 3, + NHLT_LINK_INVALID +}; + +enum nhlt_device_type { + NHLT_DEVICE_BT = 0, + NHLT_DEVICE_DMIC = 1, + NHLT_DEVICE_I2S = 4, + NHLT_DEVICE_INVALID +}; + +struct nhlt_specific_cfg { + u32 size; + u8 caps[0]; +} __packed; + +struct nhlt_fmt_cfg { + struct wav_fmt_ext fmt_ext; + struct nhlt_specific_cfg config; +} __packed; + +struct nhlt_fmt { + u8 fmt_count; + struct nhlt_fmt_cfg fmt_config[0]; +} __packed; + +struct nhlt_endpoint { + u32 length; + u8 linktype; + u8 instance_id; + u16 vendor_id; + u16 device_id; + u16 revision_id; + u32 subsystem_id; + u8 device_type; + u8 direction; + u8 virtual_bus_id; + struct nhlt_specific_cfg config; +} __packed; + +struct nhlt_acpi_table { + struct acpi_table_header header; + u8 endpoint_count; + struct nhlt_endpoint desc[0]; +} __packed; + +struct nhlt_resource_desc { + u32 extra; + u16 flags; + u64 addr_spc_gra; + u64 min_addr; + u64 max_addr; + u64 addr_trans_offset; + u64 length; +} __packed; + +#define MIC_ARRAY_2CH 2 +#define MIC_ARRAY_4CH 4 + +struct nhlt_tdm_config { + u8 virtual_slot; + u8 config_type; +} __packed; + +struct nhlt_dmic_array_config { + struct nhlt_tdm_config tdm_config; + u8 array_type; +} __packed; + +enum { + NHLT_MIC_ARRAY_2CH_SMALL = 0xa, + NHLT_MIC_ARRAY_2CH_BIG = 0xb, + NHLT_MIC_ARRAY_4CH_1ST_GEOM = 0xc, + NHLT_MIC_ARRAY_4CH_L_SHAPED = 0xd, + NHLT_MIC_ARRAY_4CH_2ND_GEOM = 0xe, + NHLT_MIC_ARRAY_VENDOR_DEFINED = 0xf, +}; + +#endif diff --git a/sound/soc/intel/skylake/skl-nhlt.c b/sound/soc/intel/skylake/skl-nhlt.c index 1132109cb992..aabc5d71650e 100644 --- a/sound/soc/intel/skylake/skl-nhlt.c +++ b/sound/soc/intel/skylake/skl-nhlt.c @@ -9,6 +9,7 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include +#include #include "skl.h" #include "skl-i2s.h" diff --git a/sound/soc/intel/skylake/skl-nhlt.h b/sound/soc/intel/skylake/skl-nhlt.h deleted file mode 100644 index f85fbf9c7ce4..000000000000 --- a/sound/soc/intel/skylake/skl-nhlt.h +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * skl-nhlt.h - Intel HDA Platform NHLT header - * - * Copyright (C) 2015 Intel Corp - * Author: Sanjiv Kumar - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ -#ifndef __SKL_NHLT_H__ -#define __SKL_NHLT_H__ - -#include - -struct wav_fmt { - u16 fmt_tag; - u16 channels; - u32 samples_per_sec; - u32 avg_bytes_per_sec; - u16 block_align; - u16 bits_per_sample; - u16 cb_size; -} __packed; - -struct wav_fmt_ext { - struct wav_fmt fmt; - union samples { - u16 valid_bits_per_sample; - u16 samples_per_block; - u16 reserved; - } sample; - u32 channel_mask; - u8 sub_fmt[16]; -} __packed; - -enum nhlt_link_type { - NHLT_LINK_HDA = 0, - NHLT_LINK_DSP = 1, - NHLT_LINK_DMIC = 2, - NHLT_LINK_SSP = 3, - NHLT_LINK_INVALID -}; - -enum nhlt_device_type { - NHLT_DEVICE_BT = 0, - NHLT_DEVICE_DMIC = 1, - NHLT_DEVICE_I2S = 4, - NHLT_DEVICE_INVALID -}; - -struct nhlt_specific_cfg { - u32 size; - u8 caps[0]; -} __packed; - -struct nhlt_fmt_cfg { - struct wav_fmt_ext fmt_ext; - struct nhlt_specific_cfg config; -} __packed; - -struct nhlt_fmt { - u8 fmt_count; - struct nhlt_fmt_cfg fmt_config[0]; -} __packed; - -struct nhlt_endpoint { - u32 length; - u8 linktype; - u8 instance_id; - u16 vendor_id; - u16 device_id; - u16 revision_id; - u32 subsystem_id; - u8 device_type; - u8 direction; - u8 virtual_bus_id; - struct nhlt_specific_cfg config; -} __packed; - -struct nhlt_acpi_table { - struct acpi_table_header header; - u8 endpoint_count; - struct nhlt_endpoint desc[0]; -} __packed; - -struct nhlt_resource_desc { - u32 extra; - u16 flags; - u64 addr_spc_gra; - u64 min_addr; - u64 max_addr; - u64 addr_trans_offset; - u64 length; -} __packed; - -#define MIC_ARRAY_2CH 2 -#define MIC_ARRAY_4CH 4 - -struct nhlt_tdm_config { - u8 virtual_slot; - u8 config_type; -} __packed; - -struct nhlt_dmic_array_config { - struct nhlt_tdm_config tdm_config; - u8 array_type; -} __packed; - -enum { - NHLT_MIC_ARRAY_2CH_SMALL = 0xa, - NHLT_MIC_ARRAY_2CH_BIG = 0xb, - NHLT_MIC_ARRAY_4CH_1ST_GEOM = 0xc, - NHLT_MIC_ARRAY_4CH_L_SHAPED = 0xd, - NHLT_MIC_ARRAY_4CH_2ND_GEOM = 0xe, - NHLT_MIC_ARRAY_VENDOR_DEFINED = 0xf, -}; - -#endif diff --git a/sound/soc/intel/skylake/skl-ssp-clk.c b/sound/soc/intel/skylake/skl-ssp-clk.c index 5bb6e40d4d3e..5bfcd46452f9 100644 --- a/sound/soc/intel/skylake/skl-ssp-clk.c +++ b/sound/soc/intel/skylake/skl-ssp-clk.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "skl.h" #include "skl-ssp-clk.h" #include "skl-topology.h" diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 6241e35213af..f8a501cf5fbd 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/sound/soc/intel/skylake/skl.h b/sound/soc/intel/skylake/skl.h index 6070666a6392..928e8115a1a7 100644 --- a/sound/soc/intel/skylake/skl.h +++ b/sound/soc/intel/skylake/skl.h @@ -16,7 +16,6 @@ #include #include #include -#include "skl-nhlt.h" #include "skl-ssp-clk.h" #define SKL_SUSPEND_DELAY 2000 -- cgit v1.2.3 From 303681f4356d322232dd5f6d9eb4bc62666064c5 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 29 Jul 2019 10:51:48 -0500 Subject: ALSA: hda: move parts of NHLT code to new module Move parts of the code outside of the Skylake driver to help detect the presence of DMICs (which are not supported by the HDaudio legacy driver). No functionality change (except for the removal of useless OR operations), only indentation and checkpatch fixes, making sure that the code compiles without ACPI and fixing an ACPI leak Signed-off-by: Pierre-Louis Bossart Signed-off-by: Takashi Iwai --- include/sound/intel-nhlt.h | 41 ++++++++++++++---- sound/hda/Kconfig | 5 +++ sound/hda/Makefile | 3 ++ sound/hda/intel-nhlt.c | 103 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 144 insertions(+), 8 deletions(-) create mode 100644 sound/hda/intel-nhlt.c (limited to 'include') diff --git a/include/sound/intel-nhlt.h b/include/sound/intel-nhlt.h index f85fbf9c7ce4..857922f03931 100644 --- a/include/sound/intel-nhlt.h +++ b/include/sound/intel-nhlt.h @@ -1,18 +1,17 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * skl-nhlt.h - Intel HDA Platform NHLT header + * intel-nhlt.h - Intel HDA Platform NHLT header * - * Copyright (C) 2015 Intel Corp - * Author: Sanjiv Kumar - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Copyright (c) 2015-2019 Intel Corporation */ -#ifndef __SKL_NHLT_H__ -#define __SKL_NHLT_H__ + +#ifndef __INTEL_NHLT_H__ +#define __INTEL_NHLT_H__ #include +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SND_INTEL_NHLT) + struct wav_fmt { u16 fmt_tag; u16 channels; @@ -116,4 +115,30 @@ enum { NHLT_MIC_ARRAY_VENDOR_DEFINED = 0xf, }; +struct nhlt_acpi_table *intel_nhlt_init(struct device *dev); + +void intel_nhlt_free(struct nhlt_acpi_table *addr); + +int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt); + +#else + +struct nhlt_acpi_table; + +static inline struct nhlt_acpi_table *intel_nhlt_init(struct device *dev) +{ + return NULL; +} + +static inline void intel_nhlt_free(struct nhlt_acpi_table *addr) +{ +} + +static inline int intel_nhlt_get_dmic_geo(struct device *dev, + struct nhlt_acpi_table *nhlt) +{ + return 0; +} +#endif + #endif diff --git a/sound/hda/Kconfig b/sound/hda/Kconfig index f6feced15f17..9ccbcb5a06bd 100644 --- a/sound/hda/Kconfig +++ b/sound/hda/Kconfig @@ -29,3 +29,8 @@ config SND_HDA_PREALLOC_SIZE Note that the pre-allocation size can be changed dynamically via a proc file (/proc/asound/card*/pcm*/sub*/prealloc), too. + +config SND_INTEL_NHLT + tristate + # this config should be selected only for Intel ACPI platforms. + # A fallback is provided so that the code compiles in all cases. \ No newline at end of file diff --git a/sound/hda/Makefile b/sound/hda/Makefile index 2160202e2dc1..8560f6ef1b19 100644 --- a/sound/hda/Makefile +++ b/sound/hda/Makefile @@ -13,3 +13,6 @@ obj-$(CONFIG_SND_HDA_CORE) += snd-hda-core.o #extended hda obj-$(CONFIG_SND_HDA_EXT_CORE) += ext/ + +snd-intel-nhlt-objs := intel-nhlt.o +obj-$(CONFIG_SND_INTEL_NHLT) += snd-intel-nhlt.o diff --git a/sound/hda/intel-nhlt.c b/sound/hda/intel-nhlt.c new file mode 100644 index 000000000000..7a62e03ba407 --- /dev/null +++ b/sound/hda/intel-nhlt.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2015-2019 Intel Corporation + +#include +#include + +#define NHLT_ACPI_HEADER_SIG "NHLT" + +/* Unique identification for getting NHLT blobs */ +static guid_t osc_guid = + GUID_INIT(0xA69F886E, 0x6CEB, 0x4594, + 0xA4, 0x1F, 0x7B, 0x5D, 0xCE, 0x24, 0xC5, 0x53); + +struct nhlt_acpi_table *intel_nhlt_init(struct device *dev) +{ + acpi_handle handle; + union acpi_object *obj; + struct nhlt_resource_desc *nhlt_ptr; + struct nhlt_acpi_table *nhlt_table = NULL; + + handle = ACPI_HANDLE(dev); + if (!handle) { + dev_err(dev, "Didn't find ACPI_HANDLE\n"); + return NULL; + } + + obj = acpi_evaluate_dsm(handle, &osc_guid, 1, 1, NULL); + + if (!obj) + return NULL; + + if (obj->type != ACPI_TYPE_BUFFER) { + dev_dbg(dev, "No NHLT table found\n"); + ACPI_FREE(obj); + return NULL; + } + + nhlt_ptr = (struct nhlt_resource_desc *)obj->buffer.pointer; + if (nhlt_ptr->length) + nhlt_table = (struct nhlt_acpi_table *) + memremap(nhlt_ptr->min_addr, nhlt_ptr->length, + MEMREMAP_WB); + ACPI_FREE(obj); + if (nhlt_table && + (strncmp(nhlt_table->header.signature, + NHLT_ACPI_HEADER_SIG, + strlen(NHLT_ACPI_HEADER_SIG)) != 0)) { + memunmap(nhlt_table); + dev_err(dev, "NHLT ACPI header signature incorrect\n"); + return NULL; + } + return nhlt_table; +} +EXPORT_SYMBOL_GPL(intel_nhlt_init); + +void intel_nhlt_free(struct nhlt_acpi_table *nhlt) +{ + memunmap((void *)nhlt); +} +EXPORT_SYMBOL_GPL(intel_nhlt_free); + +int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) +{ + struct nhlt_endpoint *epnt; + struct nhlt_dmic_array_config *cfg; + unsigned int dmic_geo = 0; + u8 j; + + if (!nhlt) + return 0; + + epnt = (struct nhlt_endpoint *)nhlt->desc; + + for (j = 0; j < nhlt->endpoint_count; j++) { + if (epnt->linktype == NHLT_LINK_DMIC) { + cfg = (struct nhlt_dmic_array_config *) + (epnt->config.caps); + switch (cfg->array_type) { + case NHLT_MIC_ARRAY_2CH_SMALL: + case NHLT_MIC_ARRAY_2CH_BIG: + dmic_geo = MIC_ARRAY_2CH; + break; + + case NHLT_MIC_ARRAY_4CH_1ST_GEOM: + case NHLT_MIC_ARRAY_4CH_L_SHAPED: + case NHLT_MIC_ARRAY_4CH_2ND_GEOM: + dmic_geo = MIC_ARRAY_4CH; + break; + + default: + dev_warn(dev, "undefined DMIC array_type 0x%0x\n", + cfg->array_type); + } + } + epnt = (struct nhlt_endpoint *)((u8 *)epnt + epnt->length); + } + + return dmic_geo; +} +EXPORT_SYMBOL_GPL(intel_nhlt_get_dmic_geo); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Intel NHLT driver"); -- cgit v1.2.3 From 7a33ea70e1868ee578fe2e9a85dd300efa1a35d5 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 29 Jul 2019 10:51:49 -0500 Subject: ALSA: hda: intel-nhlt: handle NHLT VENDOR_DEFINED DMIC geometry The NHLT spec defines a VENDOR_DEFINED geometry, which requires reading additional information to figure out the number of microphones. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Takashi Iwai --- include/sound/intel-nhlt.h | 10 ++++++++-- sound/hda/intel-nhlt.c | 6 +++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/intel-nhlt.h b/include/sound/intel-nhlt.h index 857922f03931..f657fd8fc0ad 100644 --- a/include/sound/intel-nhlt.h +++ b/include/sound/intel-nhlt.h @@ -96,16 +96,22 @@ struct nhlt_resource_desc { #define MIC_ARRAY_2CH 2 #define MIC_ARRAY_4CH 4 -struct nhlt_tdm_config { +struct nhlt_device_specific_config { u8 virtual_slot; u8 config_type; } __packed; struct nhlt_dmic_array_config { - struct nhlt_tdm_config tdm_config; + struct nhlt_device_specific_config device_config; u8 array_type; } __packed; +struct nhlt_vendor_dmic_array_config { + struct nhlt_dmic_array_config dmic_config; + u8 nb_mics; + /* TODO add vendor mic config */ +} __packed; + enum { NHLT_MIC_ARRAY_2CH_SMALL = 0xa, NHLT_MIC_ARRAY_2CH_BIG = 0xb, diff --git a/sound/hda/intel-nhlt.c b/sound/hda/intel-nhlt.c index 7a62e03ba407..daede96f28ee 100644 --- a/sound/hda/intel-nhlt.c +++ b/sound/hda/intel-nhlt.c @@ -63,6 +63,7 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) { struct nhlt_endpoint *epnt; struct nhlt_dmic_array_config *cfg; + struct nhlt_vendor_dmic_array_config *cfg_vendor; unsigned int dmic_geo = 0; u8 j; @@ -86,7 +87,10 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) case NHLT_MIC_ARRAY_4CH_2ND_GEOM: dmic_geo = MIC_ARRAY_4CH; break; - + case NHLT_MIC_ARRAY_VENDOR_DEFINED: + cfg_vendor = (struct nhlt_vendor_dmic_array_config *)cfg; + dmic_geo = cfg_vendor->nb_mics; + break; default: dev_warn(dev, "undefined DMIC array_type 0x%0x\n", cfg->array_type); -- cgit v1.2.3 From c1a280b68d4e6b6db4a65aa7865c22d8789ddf09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:37 +0200 Subject: sched/preempt: Use CONFIG_PREEMPTION where appropriate CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the preemption code, scheduler and init task over to use CONFIG_PREEMPTION. That's the first step towards RT in that area. The more complex changes are coming separately. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.117528401@linutronix.de Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 4 ++-- include/linux/preempt.h | 6 +++--- include/linux/sched.h | 6 +++--- init/init_task.c | 2 +- init/main.c | 2 +- kernel/sched/core.c | 14 +++++++------- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- 8 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index c3046c920063..d683f5e6d791 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -78,11 +78,11 @@ static __always_inline bool should_resched(int preempt_offset) tif_need_resched()); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION extern asmlinkage void preempt_schedule(void); #define __preempt_schedule() preempt_schedule() extern asmlinkage void preempt_schedule_notrace(void); #define __preempt_schedule_notrace() preempt_schedule_notrace() -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index dd92b1a93919..bbb68dba37cc 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -182,7 +182,7 @@ do { \ #define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ @@ -203,7 +203,7 @@ do { \ __preempt_schedule(); \ } while (0) -#else /* !CONFIG_PREEMPT */ +#else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ @@ -217,7 +217,7 @@ do { \ } while (0) #define preempt_check_resched() do { } while (0) -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f51932bd543..6947516a2d3e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1767,7 +1767,7 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION extern int _cond_resched(void); #else static inline int _cond_resched(void) { return 0; } @@ -1796,12 +1796,12 @@ static inline void cond_resched_rcu(void) /* * Does a critical section need to be broken due to another - * task waiting?: (technically does not depend on CONFIG_PREEMPT, + * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION return spin_is_contended(lock); #else return 0; diff --git a/init/init_task.c b/init/init_task.c index 7ab773b9b3cd..bfe06c53b14e 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -174,7 +174,7 @@ struct task_struct init_task #ifdef CONFIG_FUNCTION_GRAPH_TRACER .ret_stack = NULL, #endif -#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) +#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION) .trace_recursion = 0, #endif #ifdef CONFIG_LIVEPATCH diff --git a/init/main.c b/init/main.c index 96f8d5af52d6..653693da8da6 100644 --- a/init/main.c +++ b/init/main.c @@ -433,7 +433,7 @@ noinline void __ref rest_init(void) /* * Enable might_sleep() and smp_processor_id() checks. - * They cannot be enabled earlier because with CONFIG_PREEMPT=y + * They cannot be enabled earlier because with CONFIG_PREEMPTION=y * kernel_thread() would trigger might_sleep() splats. With * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled * already, but it's stuck on the kthreadd_done completion. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..604a5e137efe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3581,7 +3581,7 @@ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } #endif -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count @@ -3782,7 +3782,7 @@ again: * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets * called on the nearest possible occasion: * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * - If the kernel is preemptible (CONFIG_PREEMPTION=y): * * - in syscall or exception context, at the next outmost * preempt_enable(). (this might be as soon as the wake_up()'s @@ -3791,7 +3791,7 @@ again: * - in IRQ context, return from interrupt-handler to * preemptible context * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) * then at the next: * * - cond_resched() call @@ -4033,7 +4033,7 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -4105,7 +4105,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * this is the entry point to schedule() from kernel preemption @@ -5416,7 +5416,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION int __sched _cond_resched(void) { if (should_resched(0)) { @@ -5433,7 +5433,7 @@ EXPORT_SYMBOL(_cond_resched); * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..aff9d76d8d65 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7430,7 +7430,7 @@ static int detach_tasks(struct lb_env *env) detached++; env->imbalance -= load; -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..f2ce6ba1c5d5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1943,7 +1943,7 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1995,7 +1995,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. -- cgit v1.2.3 From 01b1d88b09824bea1a75b0bac04dcf50d9893875 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:38 +0200 Subject: rcu: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the conditionals in RCU to use CONFIG_PREEMPTION. That's the first step towards RCU on RT. The further tweaks are work in progress. This neither touches the selftest bits which need a closer look by Paul. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.210156346@linutronix.de Signed-off-by: Ingo Molnar --- arch/Kconfig | 2 +- include/linux/rcupdate.h | 2 +- include/linux/rcutree.h | 2 +- include/linux/torture.h | 2 +- kernel/rcu/Kconfig | 8 ++++---- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree_stall.h | 6 +++--- kernel/trace/Kconfig | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index a7b57dd42c26..c7efbc018f4f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -103,7 +103,7 @@ config STATIC_KEYS_SELFTEST config OPTPROBES def_bool y depends on KPROBES && HAVE_OPTPROBES - select TASKS_RCU if PREEMPT + select TASKS_RCU if PREEMPTION config KPROBES_ON_FTRACE def_bool y diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8f7167478c1d..c4f76a310443 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -578,7 +578,7 @@ do { \ * * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. - * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPT + * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 735601ac27d3..18b1ed9864b0 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -53,7 +53,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_is_watching(void); -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif diff --git a/include/linux/torture.h b/include/linux/torture.h index a620118385bb..6241f59e2d6f 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -86,7 +86,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #define torture_stop_kthread(n, tp) \ _torture_stop_kthread("Stopping " #n " task", &(tp)) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION #define torture_preempt_schedule() preempt_schedule() #else #define torture_preempt_schedule() diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 480edf328b51..7644eda17d62 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPT && SMP + default y if !PREEMPTION && SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -16,7 +16,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPT + default y if PREEMPTION help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -28,7 +28,7 @@ config PREEMPT_RCU config TINY_RCU bool - default y if !PREEMPT && !SMP + default y if !PREEMPTION && !SMP help This option selects the RCU implementation that is designed for UP systems from which real-time response @@ -70,7 +70,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - def_bool PREEMPT + def_bool PREEMPTION select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a14e5fbbea46..5962636502bc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1881,7 +1881,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2205,7 +2205,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT) || + if (!IS_ENABLED(CONFIG_PREEMPTION) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2622,7 +2622,7 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 065183391f75..9b92bf18b737 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPTION */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPTION */ /* * Dump stacks of all tasks running on stalled CPUs. First try using diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 98da8998c25c..d2c1fe0b451d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -146,7 +146,7 @@ config FUNCTION_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select GLOB - select TASKS_RCU if PREEMPT + select TASKS_RCU if PREEMPTION help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation -- cgit v1.2.3 From 27972765bd0410fc2ef5e86a41de17c71440a2dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:39 +0200 Subject: locking/spinlocks: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Adjust the comments in the locking code. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.302995288@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/spinlock.h | 2 +- include/linux/spinlock_api_smp.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index ed7c4d6b8235..031ce8617df8 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -214,7 +214,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) /* * Define the various spin_lock methods. Note we define these - * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The + * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The * various methods are defined as nops in the case they are not * required. */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 42dfab89e740..b762eaba4cdf 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -96,7 +96,7 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock) /* * If lockdep is enabled then we use the non-preemption spin-ops - * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * even on CONFIG_PREEMPTION, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) -- cgit v1.2.3 From eaf7b46083a7e341a23ab3d6042e0ccc115b0914 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 26 Jul 2019 09:51:12 -0300 Subject: docs: thermal: add it to the driver API The file contents mostly describes driver internals. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/driver-api/index.rst | 1 + .../driver-api/thermal/cpu-cooling-api.rst | 107 +++ .../driver-api/thermal/exynos_thermal.rst | 90 +++ .../thermal/exynos_thermal_emulation.rst | 61 ++ Documentation/driver-api/thermal/index.rst | 18 + .../driver-api/thermal/intel_powerclamp.rst | 320 +++++++++ .../driver-api/thermal/nouveau_thermal.rst | 96 +++ .../driver-api/thermal/power_allocator.rst | 271 +++++++ Documentation/driver-api/thermal/sysfs-api.rst | 798 +++++++++++++++++++++ .../thermal/x86_pkg_temperature_thermal.rst | 55 ++ Documentation/thermal/cpu-cooling-api.rst | 107 --- Documentation/thermal/exynos_thermal.rst | 90 --- Documentation/thermal/exynos_thermal_emulation.rst | 61 -- Documentation/thermal/index.rst | 18 - Documentation/thermal/intel_powerclamp.rst | 320 --------- Documentation/thermal/nouveau_thermal.rst | 96 --- Documentation/thermal/power_allocator.rst | 271 ------- Documentation/thermal/sysfs-api.rst | 798 --------------------- .../thermal/x86_pkg_temperature_thermal.rst | 55 -- MAINTAINERS | 2 +- include/linux/thermal.h | 4 +- 21 files changed, 1820 insertions(+), 1819 deletions(-) create mode 100644 Documentation/driver-api/thermal/cpu-cooling-api.rst create mode 100644 Documentation/driver-api/thermal/exynos_thermal.rst create mode 100644 Documentation/driver-api/thermal/exynos_thermal_emulation.rst create mode 100644 Documentation/driver-api/thermal/index.rst create mode 100644 Documentation/driver-api/thermal/intel_powerclamp.rst create mode 100644 Documentation/driver-api/thermal/nouveau_thermal.rst create mode 100644 Documentation/driver-api/thermal/power_allocator.rst create mode 100644 Documentation/driver-api/thermal/sysfs-api.rst create mode 100644 Documentation/driver-api/thermal/x86_pkg_temperature_thermal.rst delete mode 100644 Documentation/thermal/cpu-cooling-api.rst delete mode 100644 Documentation/thermal/exynos_thermal.rst delete mode 100644 Documentation/thermal/exynos_thermal_emulation.rst delete mode 100644 Documentation/thermal/index.rst delete mode 100644 Documentation/thermal/intel_powerclamp.rst delete mode 100644 Documentation/thermal/nouveau_thermal.rst delete mode 100644 Documentation/thermal/power_allocator.rst delete mode 100644 Documentation/thermal/sysfs-api.rst delete mode 100644 Documentation/thermal/x86_pkg_temperature_thermal.rst (limited to 'include') diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index d12a80f386a6..37ac052ded85 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -65,6 +65,7 @@ available subsections can be seen below. dmaengine/index slimbus soundwire/index + thermal/index fpga/index acpi/index backlight/lp855x-driver.rst diff --git a/Documentation/driver-api/thermal/cpu-cooling-api.rst b/Documentation/driver-api/thermal/cpu-cooling-api.rst new file mode 100644 index 000000000000..645d914c45a6 --- /dev/null +++ b/Documentation/driver-api/thermal/cpu-cooling-api.rst @@ -0,0 +1,107 @@ +======================= +CPU cooling APIs How To +======================= + +Written by Amit Daniel Kachhap + +Updated: 6 Jan 2015 + +Copyright (c) 2012 Samsung Electronics Co., Ltd(http://www.samsung.com) + +0. Introduction +=============== + +The generic cpu cooling(freq clipping) provides registration/unregistration APIs +to the caller. The binding of the cooling devices to the trip point is left for +the user. The registration APIs returns the cooling device pointer. + +1. cpu cooling APIs +=================== + +1.1 cpufreq registration/unregistration APIs +-------------------------------------------- + + :: + + struct thermal_cooling_device + *cpufreq_cooling_register(struct cpumask *clip_cpus) + + This interface function registers the cpufreq cooling device with the name + "thermal-cpufreq-%x". This api can support multiple instances of cpufreq + cooling devices. + + clip_cpus: + cpumask of cpus where the frequency constraints will happen. + + :: + + struct thermal_cooling_device + *of_cpufreq_cooling_register(struct cpufreq_policy *policy) + + This interface function registers the cpufreq cooling device with + the name "thermal-cpufreq-%x" linking it with a device tree node, in + order to bind it via the thermal DT code. This api can support multiple + instances of cpufreq cooling devices. + + policy: + CPUFreq policy. + + + :: + + void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) + + This interface function unregisters the "thermal-cpufreq-%x" cooling device. + + cdev: Cooling device pointer which has to be unregistered. + +2. Power models +=============== + +The power API registration functions provide a simple power model for +CPUs. The current power is calculated as dynamic power (static power isn't +supported currently). This power model requires that the operating-points of +the CPUs are registered using the kernel's opp library and the +`cpufreq_frequency_table` is assigned to the `struct device` of the +cpu. If you are using CONFIG_CPUFREQ_DT then the +`cpufreq_frequency_table` should already be assigned to the cpu +device. + +The dynamic power consumption of a processor depends on many factors. +For a given processor implementation the primary factors are: + +- The time the processor spends running, consuming dynamic power, as + compared to the time in idle states where dynamic consumption is + negligible. Herein we refer to this as 'utilisation'. +- The voltage and frequency levels as a result of DVFS. The DVFS + level is a dominant factor governing power consumption. +- In running time the 'execution' behaviour (instruction types, memory + access patterns and so forth) causes, in most cases, a second order + variation. In pathological cases this variation can be significant, + but typically it is of a much lesser impact than the factors above. + +A high level dynamic power consumption model may then be represented as:: + + Pdyn = f(run) * Voltage^2 * Frequency * Utilisation + +f(run) here represents the described execution behaviour and its +result has a units of Watts/Hz/Volt^2 (this often expressed in +mW/MHz/uVolt^2) + +The detailed behaviour for f(run) could be modelled on-line. However, +in practice, such an on-line model has dependencies on a number of +implementation specific processor support and characterisation +factors. Therefore, in initial implementation that contribution is +represented as a constant coefficient. This is a simplification +consistent with the relative contribution to overall power variation. + +In this simplified representation our model becomes:: + + Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation + +Where `capacitance` is a constant that represents an indicative +running time dynamic power coefficient in fundamental units of +mW/MHz/uVolt^2. Typical values for mobile CPUs might lie in range +from 100 to 500. For reference, the approximate values for the SoC in +ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and +140 for the Cortex-A53 cluster. diff --git a/Documentation/driver-api/thermal/exynos_thermal.rst b/Documentation/driver-api/thermal/exynos_thermal.rst new file mode 100644 index 000000000000..5bd556566c70 --- /dev/null +++ b/Documentation/driver-api/thermal/exynos_thermal.rst @@ -0,0 +1,90 @@ +======================== +Kernel driver exynos_tmu +======================== + +Supported chips: + +* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC + + Datasheet: Not publicly available + +Authors: Donggeun Kim +Authors: Amit Daniel + +TMU controller Description: +--------------------------- + +This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC. + +The chip only exposes the measured 8-bit temperature code value +through a register. +Temperature can be taken from the temperature code. +There are three equations converting from temperature to temperature code. + +The three equations are: + 1. Two point trimming:: + + Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1 + + 2. One point trimming:: + + Tc = T + TI1 - 25 + + 3. No trimming:: + + Tc = T + 50 + + Tc: + Temperature code, T: Temperature, + TI1: + Trimming info for 25 degree Celsius (stored at TRIMINFO register) + Temperature code measured at 25 degree Celsius which is unchanged + TI2: + Trimming info for 85 degree Celsius (stored at TRIMINFO register) + Temperature code measured at 85 degree Celsius which is unchanged + +TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt +when temperature exceeds pre-defined levels. +The maximum number of configurable threshold is five. +The threshold levels are defined as follows:: + + Level_0: current temperature > trigger_level_0 + threshold + Level_1: current temperature > trigger_level_1 + threshold + Level_2: current temperature > trigger_level_2 + threshold + Level_3: current temperature > trigger_level_3 + threshold + +The threshold and each trigger_level are set +through the corresponding registers. + +When an interrupt occurs, this driver notify kernel thermal framework +with the function exynos_report_trigger. +Although an interrupt condition for level_0 can be set, +it can be used to synchronize the cooling action. + +TMU driver description: +----------------------- + +The exynos thermal driver is structured as:: + + Kernel Core thermal framework + (thermal_core.c, step_wise.c, cpu_cooling.c) + ^ + | + | + TMU configuration data -----> TMU Driver <----> Exynos Core thermal wrapper + (exynos_tmu_data.c) (exynos_tmu.c) (exynos_thermal_common.c) + (exynos_tmu_data.h) (exynos_tmu.h) (exynos_thermal_common.h) + +a) TMU configuration data: + This consist of TMU register offsets/bitfields + described through structure exynos_tmu_registers. Also several + other platform data (struct exynos_tmu_platform_data) members + are used to configure the TMU. +b) TMU driver: + This component initialises the TMU controller and sets different + thresholds. It invokes core thermal implementation with the call + exynos_report_trigger. +c) Exynos Core thermal wrapper: + This provides 3 wrapper function to use the + Kernel core thermal framework. They are exynos_unregister_thermal, + exynos_register_thermal and exynos_report_trigger. diff --git a/Documentation/driver-api/thermal/exynos_thermal_emulation.rst b/Documentation/driver-api/thermal/exynos_thermal_emulation.rst new file mode 100644 index 000000000000..c21d10838bc5 --- /dev/null +++ b/Documentation/driver-api/thermal/exynos_thermal_emulation.rst @@ -0,0 +1,61 @@ +===================== +Exynos Emulation Mode +===================== + +Copyright (C) 2012 Samsung Electronics + +Written by Jonghwa Lee + +Description +----------- + +Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal +management unit. Thermal emulation mode supports software debug for +TMU's operation. User can set temperature manually with software code +and TMU will read current temperature from user value not from sensor's +value. + +Enabling CONFIG_THERMAL_EMULATION option will make this support +available. When it's enabled, sysfs node will be created as +/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp. + +The sysfs node, 'emul_node', will contain value 0 for the initial state. +When you input any temperature you want to update to sysfs node, it +automatically enable emulation mode and current temperature will be +changed into it. + +(Exynos also supports user changeable delay time which would be used to +delay of changing temperature. However, this node only uses same delay +of real sensing time, 938us.) + +Exynos emulation mode requires synchronous of value changing and +enabling. It means when you want to update the any value of delay or +next temperature, then you have to enable emulation mode at the same +time. (Or you have to keep the mode enabling.) If you don't, it fails to +change the value to updated one and just use last succeessful value +repeatedly. That's why this node gives users the right to change +termerpature only. Just one interface makes it more simply to use. + +Disabling emulation mode only requires writing value 0 to sysfs node. + +:: + + + TEMP 120 | + | + 100 | + | + 80 | + | +----------- + 60 | | | + | +-------------| | + 40 | | | | + | | | | + 20 | | | +---------- + | | | | | + 0 |______________|_____________|__________|__________|_________ + A A A A TIME + |<----->| |<----->| |<----->| | + | 938us | | | | | | + emulation : 0 50 | 70 | 20 | 0 + current temp: sensor 50 70 20 sensor diff --git a/Documentation/driver-api/thermal/index.rst b/Documentation/driver-api/thermal/index.rst new file mode 100644 index 000000000000..5ba61d19c6ae --- /dev/null +++ b/Documentation/driver-api/thermal/index.rst @@ -0,0 +1,18 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======= +Thermal +======= + +.. toctree:: + :maxdepth: 1 + + cpu-cooling-api + sysfs-api + power_allocator + + exynos_thermal + exynos_thermal_emulation + intel_powerclamp + nouveau_thermal + x86_pkg_temperature_thermal diff --git a/Documentation/driver-api/thermal/intel_powerclamp.rst b/Documentation/driver-api/thermal/intel_powerclamp.rst new file mode 100644 index 000000000000..3f6dfb0b3ea6 --- /dev/null +++ b/Documentation/driver-api/thermal/intel_powerclamp.rst @@ -0,0 +1,320 @@ +======================= +Intel Powerclamp Driver +======================= + +By: + - Arjan van de Ven + - Jacob Pan + +.. Contents: + + (*) Introduction + - Goals and Objectives + + (*) Theory of Operation + - Idle Injection + - Calibration + + (*) Performance Analysis + - Effectiveness and Limitations + - Power vs Performance + - Scalability + - Calibration + - Comparison with Alternative Techniques + + (*) Usage and Interfaces + - Generic Thermal Layer (sysfs) + - Kernel APIs (TBD) + +INTRODUCTION +============ + +Consider the situation where a system’s power consumption must be +reduced at runtime, due to power budget, thermal constraint, or noise +level, and where active cooling is not preferred. Software managed +passive power reduction must be performed to prevent the hardware +actions that are designed for catastrophic scenarios. + +Currently, P-states, T-states (clock modulation), and CPU offlining +are used for CPU throttling. + +On Intel CPUs, C-states provide effective power reduction, but so far +they’re only used opportunistically, based on workload. With the +development of intel_powerclamp driver, the method of synchronizing +idle injection across all online CPU threads was introduced. The goal +is to achieve forced and controllable C-state residency. + +Test/Analysis has been made in the areas of power, performance, +scalability, and user experience. In many cases, clear advantage is +shown over taking the CPU offline or modulating the CPU clock. + + +THEORY OF OPERATION +=================== + +Idle Injection +-------------- + +On modern Intel processors (Nehalem or later), package level C-state +residency is available in MSRs, thus also available to the kernel. + +These MSRs are:: + + #define MSR_PKG_C2_RESIDENCY 0x60D + #define MSR_PKG_C3_RESIDENCY 0x3F8 + #define MSR_PKG_C6_RESIDENCY 0x3F9 + #define MSR_PKG_C7_RESIDENCY 0x3FA + +If the kernel can also inject idle time to the system, then a +closed-loop control system can be established that manages package +level C-state. The intel_powerclamp driver is conceived as such a +control system, where the target set point is a user-selected idle +ratio (based on power reduction), and the error is the difference +between the actual package level C-state residency ratio and the target idle +ratio. + +Injection is controlled by high priority kernel threads, spawned for +each online CPU. + +These kernel threads, with SCHED_FIFO class, are created to perform +clamping actions of controlled duty ratio and duration. Each per-CPU +thread synchronizes its idle time and duration, based on the rounding +of jiffies, so accumulated errors can be prevented to avoid a jittery +effect. Threads are also bound to the CPU such that they cannot be +migrated, unless the CPU is taken offline. In this case, threads +belong to the offlined CPUs will be terminated immediately. + +Running as SCHED_FIFO and relatively high priority, also allows such +scheme to work for both preemptable and non-preemptable kernels. +Alignment of idle time around jiffies ensures scalability for HZ +values. This effect can be better visualized using a Perf timechart. +The following diagram shows the behavior of kernel thread +kidle_inject/cpu. During idle injection, it runs monitor/mwait idle +for a given "duration", then relinquishes the CPU to other tasks, +until the next time interval. + +The NOHZ schedule tick is disabled during idle time, but interrupts +are not masked. Tests show that the extra wakeups from scheduler tick +have a dramatic impact on the effectiveness of the powerclamp driver +on large scale systems (Westmere system with 80 processors). + +:: + + CPU0 + ____________ ____________ + kidle_inject/0 | sleep | mwait | sleep | + _________| |________| |_______ + duration + CPU1 + ____________ ____________ + kidle_inject/1 | sleep | mwait | sleep | + _________| |________| |_______ + ^ + | + | + roundup(jiffies, interval) + +Only one CPU is allowed to collect statistics and update global +control parameters. This CPU is referred to as the controlling CPU in +this document. The controlling CPU is elected at runtime, with a +policy that favors BSP, taking into account the possibility of a CPU +hot-plug. + +In terms of dynamics of the idle control system, package level idle +time is considered largely as a non-causal system where its behavior +cannot be based on the past or current input. Therefore, the +intel_powerclamp driver attempts to enforce the desired idle time +instantly as given input (target idle ratio). After injection, +powerclamp monitors the actual idle for a given time window and adjust +the next injection accordingly to avoid over/under correction. + +When used in a causal control system, such as a temperature control, +it is up to the user of this driver to implement algorithms where +past samples and outputs are included in the feedback. For example, a +PID-based thermal controller can use the powerclamp driver to +maintain a desired target temperature, based on integral and +derivative gains of the past samples. + + + +Calibration +----------- +During scalability testing, it is observed that synchronized actions +among CPUs become challenging as the number of cores grows. This is +also true for the ability of a system to enter package level C-states. + +To make sure the intel_powerclamp driver scales well, online +calibration is implemented. The goals for doing such a calibration +are: + +a) determine the effective range of idle injection ratio +b) determine the amount of compensation needed at each target ratio + +Compensation to each target ratio consists of two parts: + + a) steady state error compensation + This is to offset the error occurring when the system can + enter idle without extra wakeups (such as external interrupts). + + b) dynamic error compensation + When an excessive amount of wakeups occurs during idle, an + additional idle ratio can be added to quiet interrupts, by + slowing down CPU activities. + +A debugfs file is provided for the user to examine compensation +progress and results, such as on a Westmere system:: + + [jacob@nex01 ~]$ cat + /sys/kernel/debug/intel_powerclamp/powerclamp_calib + controlling cpu: 0 + pct confidence steady dynamic (compensation) + 0 0 0 0 + 1 1 0 0 + 2 1 1 0 + 3 3 1 0 + 4 3 1 0 + 5 3 1 0 + 6 3 1 0 + 7 3 1 0 + 8 3 1 0 + ... + 30 3 2 0 + 31 3 2 0 + 32 3 1 0 + 33 3 2 0 + 34 3 1 0 + 35 3 2 0 + 36 3 1 0 + 37 3 2 0 + 38 3 1 0 + 39 3 2 0 + 40 3 3 0 + 41 3 1 0 + 42 3 2 0 + 43 3 1 0 + 44 3 1 0 + 45 3 2 0 + 46 3 3 0 + 47 3 0 0 + 48 3 2 0 + 49 3 3 0 + +Calibration occurs during runtime. No offline method is available. +Steady state compensation is used only when confidence levels of all +adjacent ratios have reached satisfactory level. A confidence level +is accumulated based on clean data collected at runtime. Data +collected during a period without extra interrupts is considered +clean. + +To compensate for excessive amounts of wakeup during idle, additional +idle time is injected when such a condition is detected. Currently, +we have a simple algorithm to double the injection ratio. A possible +enhancement might be to throttle the offending IRQ, such as delaying +EOI for level triggered interrupts. But it is a challenge to be +non-intrusive to the scheduler or the IRQ core code. + + +CPU Online/Offline +------------------ +Per-CPU kernel threads are started/stopped upon receiving +notifications of CPU hotplug activities. The intel_powerclamp driver +keeps track of clamping kernel threads, even after they are migrated +to other CPUs, after a CPU offline event. + + +Performance Analysis +==================== +This section describes the general performance data collected on +multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P). + +Effectiveness and Limitations +----------------------------- +The maximum range that idle injection is allowed is capped at 50 +percent. As mentioned earlier, since interrupts are allowed during +forced idle time, excessive interrupts could result in less +effectiveness. The extreme case would be doing a ping -f to generated +flooded network interrupts without much CPU acknowledgement. In this +case, little can be done from the idle injection threads. In most +normal cases, such as scp a large file, applications can be throttled +by the powerclamp driver, since slowing down the CPU also slows down +network protocol processing, which in turn reduces interrupts. + +When control parameters change at runtime by the controlling CPU, it +may take an additional period for the rest of the CPUs to catch up +with the changes. During this time, idle injection is out of sync, +thus not able to enter package C- states at the expected ratio. But +this effect is minor, in that in most cases change to the target +ratio is updated much less frequently than the idle injection +frequency. + +Scalability +----------- +Tests also show a minor, but measurable, difference between the 4P/8P +Ivy Bridge system and the 80P Westmere server under 50% idle ratio. +More compensation is needed on Westmere for the same amount of +target idle ratio. The compensation also increases as the idle ratio +gets larger. The above reason constitutes the need for the +calibration code. + +On the IVB 8P system, compared to an offline CPU, powerclamp can +achieve up to 40% better performance per watt. (measured by a spin +counter summed over per CPU counting threads spawned for all running +CPUs). + +Usage and Interfaces +==================== +The powerclamp driver is registered to the generic thermal layer as a +cooling device. Currently, it’s not bound to any thermal zones:: + + jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . * + cur_state:0 + max_state:50 + type:intel_powerclamp + +cur_state allows user to set the desired idle percentage. Writing 0 to +cur_state will stop idle injection. Writing a value between 1 and +max_state will start the idle injection. Reading cur_state returns the +actual and current idle percentage. This may not be the same value +set by the user in that current idle percentage depends on workload +and includes natural idle. When idle injection is disabled, reading +cur_state returns value -1 instead of 0 which is to avoid confusing +100% busy state with the disabled state. + +Example usage: +- To inject 25% idle time:: + + $ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state + +If the system is not busy and has more than 25% idle time already, +then the powerclamp driver will not start idle injection. Using Top +will not show idle injection kernel threads. + +If the system is busy (spin test below) and has less than 25% natural +idle time, powerclamp kernel threads will do idle injection. Forced +idle time is accounted as normal idle in that common code path is +taken as the idle task. + +In this example, 24.1% idle is shown. This helps the system admin or +user determine the cause of slowdown, when a powerclamp driver is in action:: + + + Tasks: 197 total, 1 running, 196 sleeping, 0 stopped, 0 zombie + Cpu(s): 71.2%us, 4.7%sy, 0.0%ni, 24.1%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st + Mem: 3943228k total, 1689632k used, 2253596k free, 74960k buffers + Swap: 4087804k total, 0k used, 4087804k free, 945336k cached + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 3352 jacob 20 0 262m 644 428 S 286 0.0 0:17.16 spin + 3341 root -51 0 0 0 0 D 25 0.0 0:01.62 kidle_inject/0 + 3344 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/3 + 3342 root -51 0 0 0 0 D 25 0.0 0:01.61 kidle_inject/1 + 3343 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/2 + 2935 jacob 20 0 696m 125m 35m S 5 3.3 0:31.11 firefox + 1546 root 20 0 158m 20m 6640 S 3 0.5 0:26.97 Xorg + 2100 jacob 20 0 1223m 88m 30m S 3 2.3 0:23.68 compiz + +Tests have shown that by using the powerclamp driver as a cooling +device, a PID based userspace thermal controller can manage to +control CPU temperature effectively, when no other thermal influence +is added. For example, a UltraBook user can compile the kernel under +certain temperature (below most active trip points). diff --git a/Documentation/driver-api/thermal/nouveau_thermal.rst b/Documentation/driver-api/thermal/nouveau_thermal.rst new file mode 100644 index 000000000000..37255fd6735d --- /dev/null +++ b/Documentation/driver-api/thermal/nouveau_thermal.rst @@ -0,0 +1,96 @@ +===================== +Kernel driver nouveau +===================== + +Supported chips: + +* NV43+ + +Authors: Martin Peres (mupuf) + +Description +----------- + +This driver allows to read the GPU core temperature, drive the GPU fan and +set temperature alarms. + +Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau +cannot access any of the i2c external monitoring chips it may find. If you +have one of those, temperature and/or fan management through Nouveau's HWMON +interface is likely not to work. This document may then not cover your situation +entirely. + +Temperature management +---------------------- + +Temperature is exposed under as a read-only HWMON attribute temp1_input. + +In order to protect the GPU from overheating, Nouveau supports 4 configurable +temperature thresholds: + + * Fan_boost: + Fan speed is set to 100% when reaching this temperature; + * Downclock: + The GPU will be downclocked to reduce its power dissipation; + * Critical: + The GPU is put on hold to further lower power dissipation; + * Shutdown: + Shut the computer down to protect your GPU. + +WARNING: + Some of these thresholds may not be used by Nouveau depending + on your chipset. + +The default value for these thresholds comes from the GPU's vbios. These +thresholds can be configured thanks to the following HWMON attributes: + + * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst; + * Downclock: temp1_max and temp1_max_hyst; + * Critical: temp1_crit and temp1_crit_hyst; + * Shutdown: temp1_emergency and temp1_emergency_hyst. + +NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget +to multiply! + +Fan management +-------------- + +Not all cards have a drivable fan. If you do, then the following HWMON +attributes should be available: + + * pwm1_enable: + Current fan management mode (NONE, MANUAL or AUTO); + * pwm1: + Current PWM value (power percentage); + * pwm1_min: + The minimum PWM speed allowed; + * pwm1_max: + The maximum PWM speed allowed (bypassed when hitting Fan_boost); + +You may also have the following attribute: + + * fan1_input: + Speed in RPM of your fan. + +Your fan can be driven in different modes: + + * 0: The fan is left untouched; + * 1: The fan can be driven in manual (use pwm1 to change the speed); + * 2; The fan is driven automatically depending on the temperature. + +NOTE: + Be sure to use the manual mode if you want to drive the fan speed manually + +NOTE2: + When operating in manual mode outside the vbios-defined + [PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate + depending on your hardware. + +Bug reports +----------- + +Thermal management on Nouveau is new and may not work on all cards. If you have +inquiries, please ping mupuf on IRC (#nouveau, freenode). + +Bug reports should be filled on Freedesktop's bug tracker. Please follow +http://nouveau.freedesktop.org/wiki/Bugs diff --git a/Documentation/driver-api/thermal/power_allocator.rst b/Documentation/driver-api/thermal/power_allocator.rst new file mode 100644 index 000000000000..67b6a3297238 --- /dev/null +++ b/Documentation/driver-api/thermal/power_allocator.rst @@ -0,0 +1,271 @@ +================================= +Power allocator governor tunables +================================= + +Trip points +----------- + +The governor works optimally with the following two passive trip points: + +1. "switch on" trip point: temperature above which the governor + control loop starts operating. This is the first passive trip + point of the thermal zone. + +2. "desired temperature" trip point: it should be higher than the + "switch on" trip point. This the target temperature the governor + is controlling for. This is the last passive trip point of the + thermal zone. + +PID Controller +-------------- + +The power allocator governor implements a +Proportional-Integral-Derivative controller (PID controller) with +temperature as the control input and power as the controlled output: + + P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power + +where + - e = desired_temperature - current_temperature + - err_integral is the sum of previous errors + - diff_err = e - previous_error + +It is similar to the one depicted below:: + + k_d + | + current_temp | + | v + | +----------+ +---+ + | +----->| diff_err |-->| X |------+ + | | +----------+ +---+ | + | | | tdp actor + | | k_i | | get_requested_power() + | | | | | | | + | | | | | | | ... + v | v v v v v + +---+ | +-------+ +---+ +---+ +---+ +----------+ + | S |-----+----->| sum e |----->| X |--->| S |-->| S |-->|power | + +---+ | +-------+ +---+ +---+ +---+ |allocation| + ^ | ^ +----------+ + | | | | | + | | +---+ | | | + | +------->| X |-------------------+ v v + | +---+ granted performance + desired_temperature ^ + | + | + k_po/k_pu + +Sustainable power +----------------- + +An estimate of the sustainable dissipatable power (in mW) should be +provided while registering the thermal zone. This estimates the +sustained power that can be dissipated at the desired control +temperature. This is the maximum sustained power for allocation at +the desired maximum temperature. The actual sustained power can vary +for a number of reasons. The closed loop controller will take care of +variations such as environmental conditions, and some factors related +to the speed-grade of the silicon. `sustainable_power` is therefore +simply an estimate, and may be tuned to affect the aggressiveness of +the thermal ramp. For reference, the sustainable power of a 4" phone +is typically 2000mW, while on a 10" tablet is around 4500mW (may vary +depending on screen size). + +If you are using device tree, do add it as a property of the +thermal-zone. For example:: + + thermal-zones { + soc_thermal { + polling-delay = <1000>; + polling-delay-passive = <100>; + sustainable-power = <2500>; + ... + +Instead, if the thermal zone is registered from the platform code, pass a +`thermal_zone_params` that has a `sustainable_power`. If no +`thermal_zone_params` were being passed, then something like below +will suffice:: + + static const struct thermal_zone_params tz_params = { + .sustainable_power = 3500, + }; + +and then pass `tz_params` as the 5th parameter to +`thermal_zone_device_register()` + +k_po and k_pu +------------- + +The implementation of the PID controller in the power allocator +thermal governor allows the configuration of two proportional term +constants: `k_po` and `k_pu`. `k_po` is the proportional term +constant during temperature overshoot periods (current temperature is +above "desired temperature" trip point). Conversely, `k_pu` is the +proportional term constant during temperature undershoot periods +(current temperature below "desired temperature" trip point). + +These controls are intended as the primary mechanism for configuring +the permitted thermal "ramp" of the system. For instance, a lower +`k_pu` value will provide a slower ramp, at the cost of capping +available capacity at a low temperature. On the other hand, a high +value of `k_pu` will result in the governor granting very high power +while temperature is low, and may lead to temperature overshooting. + +The default value for `k_pu` is:: + + 2 * sustainable_power / (desired_temperature - switch_on_temp) + +This means that at `switch_on_temp` the output of the controller's +proportional term will be 2 * `sustainable_power`. The default value +for `k_po` is:: + + sustainable_power / (desired_temperature - switch_on_temp) + +Focusing on the proportional and feed forward values of the PID +controller equation we have:: + + P_max = k_p * e + sustainable_power + +The proportional term is proportional to the difference between the +desired temperature and the current one. When the current temperature +is the desired one, then the proportional component is zero and +`P_max` = `sustainable_power`. That is, the system should operate in +thermal equilibrium under constant load. `sustainable_power` is only +an estimate, which is the reason for closed-loop control such as this. + +Expanding `k_pu` we get:: + + P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) + + sustainable_power + +where: + + - T_set is the desired temperature + - T is the current temperature + - T_on is the switch on temperature + +When the current temperature is the switch_on temperature, the above +formula becomes:: + + P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) + + sustainable_power = 2 * sustainable_power + sustainable_power = + 3 * sustainable_power + +Therefore, the proportional term alone linearly decreases power from +3 * `sustainable_power` to `sustainable_power` as the temperature +rises from the switch on temperature to the desired temperature. + +k_i and integral_cutoff +----------------------- + +`k_i` configures the PID loop's integral term constant. This term +allows the PID controller to compensate for long term drift and for +the quantized nature of the output control: cooling devices can't set +the exact power that the governor requests. When the temperature +error is below `integral_cutoff`, errors are accumulated in the +integral term. This term is then multiplied by `k_i` and the result +added to the output of the controller. Typically `k_i` is set low (1 +or 2) and `integral_cutoff` is 0. + +k_d +--- + +`k_d` configures the PID loop's derivative term constant. It's +recommended to leave it as the default: 0. + +Cooling device power API +======================== + +Cooling devices controlled by this governor must supply the additional +"power" API in their `cooling_device_ops`. It consists on three ops: + +1. :: + + int get_requested_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *power); + + +@cdev: + The `struct thermal_cooling_device` pointer +@tz: + thermal zone in which we are currently operating +@power: + pointer in which to store the calculated power + +`get_requested_power()` calculates the power requested by the device +in milliwatts and stores it in @power . It should return 0 on +success, -E* on failure. This is currently used by the power +allocator governor to calculate how much power to give to each cooling +device. + +2. :: + + int state2power(struct thermal_cooling_device *cdev, struct + thermal_zone_device *tz, unsigned long state, + u32 *power); + +@cdev: + The `struct thermal_cooling_device` pointer +@tz: + thermal zone in which we are currently operating +@state: + A cooling device state +@power: + pointer in which to store the equivalent power + +Convert cooling device state @state into power consumption in +milliwatts and store it in @power. It should return 0 on success, -E* +on failure. This is currently used by thermal core to calculate the +maximum power that an actor can consume. + +3. :: + + int power2state(struct thermal_cooling_device *cdev, u32 power, + unsigned long *state); + +@cdev: + The `struct thermal_cooling_device` pointer +@power: + power in milliwatts +@state: + pointer in which to store the resulting state + +Calculate a cooling device state that would make the device consume at +most @power mW and store it in @state. It should return 0 on success, +-E* on failure. This is currently used by the thermal core to convert +a given power set by the power allocator governor to a state that the +cooling device can set. It is a function because this conversion may +depend on external factors that may change so this function should the +best conversion given "current circumstances". + +Cooling device weights +---------------------- + +Weights are a mechanism to bias the allocation among cooling +devices. They express the relative power efficiency of different +cooling devices. Higher weight can be used to express higher power +efficiency. Weighting is relative such that if each cooling device +has a weight of one they are considered equal. This is particularly +useful in heterogeneous systems where two cooling devices may perform +the same kind of compute, but with different efficiency. For example, +a system with two different types of processors. + +If the thermal zone is registered using +`thermal_zone_device_register()` (i.e., platform code), then weights +are passed as part of the thermal zone's `thermal_bind_parameters`. +If the platform is registered using device tree, then they are passed +as the `contribution` property of each map in the `cooling-maps` node. + +Limitations of the power allocator governor +=========================================== + +The power allocator governor's PID controller works best if there is a +periodic tick. If you have a driver that calls +`thermal_zone_device_update()` (or anything that ends up calling the +governor's `throttle()` function) repetitively, the governor response +won't be very good. Note that this is not particular to this +governor, step-wise will also misbehave if you call its throttle() +faster than the normal thermal framework tick (due to interrupts for +example) as it will overreact. diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst new file mode 100644 index 000000000000..fab2c9b36d08 --- /dev/null +++ b/Documentation/driver-api/thermal/sysfs-api.rst @@ -0,0 +1,798 @@ +=================================== +Generic Thermal Sysfs driver How To +=================================== + +Written by Sujith Thomas , Zhang Rui + +Updated: 2 January 2008 + +Copyright (c) 2008 Intel Corporation + + +0. Introduction +=============== + +The generic thermal sysfs provides a set of interfaces for thermal zone +devices (sensors) and thermal cooling devices (fan, processor...) to register +with the thermal management solution and to be a part of it. + +This how-to focuses on enabling new thermal zone and cooling devices to +participate in thermal management. +This solution is platform independent and any type of thermal zone devices +and cooling devices should be able to make use of the infrastructure. + +The main task of the thermal sysfs driver is to expose thermal zone attributes +as well as cooling device attributes to the user space. +An intelligent thermal management application can make decisions based on +inputs from thermal zone attributes (the current temperature and trip point +temperature) and throttle appropriate devices. + +- `[0-*]` denotes any positive number starting from 0 +- `[1-*]` denotes any positive number starting from 1 + +1. thermal sysfs driver interface functions +=========================================== + +1.1 thermal zone device interface +--------------------------------- + + :: + + struct thermal_zone_device + *thermal_zone_device_register(char *type, + int trips, int mask, void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + int passive_delay, int polling_delay)) + + This interface function adds a new thermal zone device (sensor) to + /sys/class/thermal folder as `thermal_zone[0-*]`. It tries to bind all the + thermal cooling devices registered at the same time. + + type: + the thermal zone type. + trips: + the total number of trip points this thermal zone supports. + mask: + Bit string: If 'n'th bit is set, then trip point 'n' is writeable. + devdata: + device private data + ops: + thermal zone device call-backs. + + .bind: + bind the thermal zone device with a thermal cooling device. + .unbind: + unbind the thermal zone device with a thermal cooling device. + .get_temp: + get the current temperature of the thermal zone. + .set_trips: + set the trip points window. Whenever the current temperature + is updated, the trip points immediately below and above the + current temperature are found. + .get_mode: + get the current mode (enabled/disabled) of the thermal zone. + + - "enabled" means the kernel thermal management is + enabled. + - "disabled" will prevent kernel thermal driver action + upon trip points so that user applications can take + charge of thermal management. + .set_mode: + set the mode (enabled/disabled) of the thermal zone. + .get_trip_type: + get the type of certain trip point. + .get_trip_temp: + get the temperature above which the certain trip point + will be fired. + .set_emul_temp: + set the emulation temperature which helps in debugging + different threshold temperature points. + tzp: + thermal zone platform parameters. + passive_delay: + number of milliseconds to wait between polls when + performing passive cooling. + polling_delay: + number of milliseconds to wait between polls when checking + whether trip points have been crossed (0 for interrupt driven systems). + + :: + + void thermal_zone_device_unregister(struct thermal_zone_device *tz) + + This interface function removes the thermal zone device. + It deletes the corresponding entry from /sys/class/thermal folder and + unbinds all the thermal cooling devices it uses. + + :: + + struct thermal_zone_device + *thermal_zone_of_sensor_register(struct device *dev, int sensor_id, + void *data, + const struct thermal_zone_of_device_ops *ops) + + This interface adds a new sensor to a DT thermal zone. + This function will search the list of thermal zones described in + device tree and look for the zone that refer to the sensor device + pointed by dev->of_node as temperature providers. For the zone + pointing to the sensor node, the sensor will be added to the DT + thermal zone device. + + The parameters for this interface are: + + dev: + Device node of sensor containing valid node pointer in + dev->of_node. + sensor_id: + a sensor identifier, in case the sensor IP has more + than one sensors + data: + a private pointer (owned by the caller) that will be + passed back, when a temperature reading is needed. + ops: + `struct thermal_zone_of_device_ops *`. + + ============== ======================================= + get_temp a pointer to a function that reads the + sensor temperature. This is mandatory + callback provided by sensor driver. + set_trips a pointer to a function that sets a + temperature window. When this window is + left the driver must inform the thermal + core via thermal_zone_device_update. + get_trend a pointer to a function that reads the + sensor temperature trend. + set_emul_temp a pointer to a function that sets + sensor emulated temperature. + ============== ======================================= + + The thermal zone temperature is provided by the get_temp() function + pointer of thermal_zone_of_device_ops. When called, it will + have the private pointer @data back. + + It returns error pointer if fails otherwise valid thermal zone device + handle. Caller should check the return handle with IS_ERR() for finding + whether success or not. + + :: + + void thermal_zone_of_sensor_unregister(struct device *dev, + struct thermal_zone_device *tzd) + + This interface unregisters a sensor from a DT thermal zone which was + successfully added by interface thermal_zone_of_sensor_register(). + This function removes the sensor callbacks and private data from the + thermal zone device registered with thermal_zone_of_sensor_register() + interface. It will also silent the zone by remove the .get_temp() and + get_trend() thermal zone device callbacks. + + :: + + struct thermal_zone_device + *devm_thermal_zone_of_sensor_register(struct device *dev, + int sensor_id, + void *data, + const struct thermal_zone_of_device_ops *ops) + + This interface is resource managed version of + thermal_zone_of_sensor_register(). + + All details of thermal_zone_of_sensor_register() described in + section 1.1.3 is applicable here. + + The benefit of using this interface to register sensor is that it + is not require to explicitly call thermal_zone_of_sensor_unregister() + in error path or during driver unbinding as this is done by driver + resource manager. + + :: + + void devm_thermal_zone_of_sensor_unregister(struct device *dev, + struct thermal_zone_device *tzd) + + This interface is resource managed version of + thermal_zone_of_sensor_unregister(). + All details of thermal_zone_of_sensor_unregister() described in + section 1.1.4 is applicable here. + Normally this function will not need to be called and the resource + management code will ensure that the resource is freed. + + :: + + int thermal_zone_get_slope(struct thermal_zone_device *tz) + + This interface is used to read the slope attribute value + for the thermal zone device, which might be useful for platform + drivers for temperature calculations. + + :: + + int thermal_zone_get_offset(struct thermal_zone_device *tz) + + This interface is used to read the offset attribute value + for the thermal zone device, which might be useful for platform + drivers for temperature calculations. + +1.2 thermal cooling device interface +------------------------------------ + + + :: + + struct thermal_cooling_device + *thermal_cooling_device_register(char *name, + void *devdata, struct thermal_cooling_device_ops *) + + This interface function adds a new thermal cooling device (fan/processor/...) + to /sys/class/thermal/ folder as `cooling_device[0-*]`. It tries to bind itself + to all the thermal zone devices registered at the same time. + + name: + the cooling device name. + devdata: + device private data. + ops: + thermal cooling devices call-backs. + + .get_max_state: + get the Maximum throttle state of the cooling device. + .get_cur_state: + get the Currently requested throttle state of the + cooling device. + .set_cur_state: + set the Current throttle state of the cooling device. + + :: + + void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) + + This interface function removes the thermal cooling device. + It deletes the corresponding entry from /sys/class/thermal folder and + unbinds itself from all the thermal zone devices using it. + +1.3 interface for binding a thermal zone device with a thermal cooling device +----------------------------------------------------------------------------- + + :: + + int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, + int trip, struct thermal_cooling_device *cdev, + unsigned long upper, unsigned long lower, unsigned int weight); + + This interface function binds a thermal cooling device to a particular trip + point of a thermal zone device. + + This function is usually called in the thermal zone device .bind callback. + + tz: + the thermal zone device + cdev: + thermal cooling device + trip: + indicates which trip point in this thermal zone the cooling device + is associated with. + upper: + the Maximum cooling state for this trip point. + THERMAL_NO_LIMIT means no upper limit, + and the cooling device can be in max_state. + lower: + the Minimum cooling state can be used for this trip point. + THERMAL_NO_LIMIT means no lower limit, + and the cooling device can be in cooling state 0. + weight: + the influence of this cooling device in this thermal + zone. See 1.4.1 below for more information. + + :: + + int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz, + int trip, struct thermal_cooling_device *cdev); + + This interface function unbinds a thermal cooling device from a particular + trip point of a thermal zone device. This function is usually called in + the thermal zone device .unbind callback. + + tz: + the thermal zone device + cdev: + thermal cooling device + trip: + indicates which trip point in this thermal zone the cooling device + is associated with. + +1.4 Thermal Zone Parameters +--------------------------- + + :: + + struct thermal_bind_params + + This structure defines the following parameters that are used to bind + a zone with a cooling device for a particular trip point. + + .cdev: + The cooling device pointer + .weight: + The 'influence' of a particular cooling device on this + zone. This is relative to the rest of the cooling + devices. For example, if all cooling devices have a + weight of 1, then they all contribute the same. You can + use percentages if you want, but it's not mandatory. A + weight of 0 means that this cooling device doesn't + contribute to the cooling of this zone unless all cooling + devices have a weight of 0. If all weights are 0, then + they all contribute the same. + .trip_mask: + This is a bit mask that gives the binding relation between + this thermal zone and cdev, for a particular trip point. + If nth bit is set, then the cdev and thermal zone are bound + for trip point n. + .binding_limits: + This is an array of cooling state limits. Must have + exactly 2 * thermal_zone.number_of_trip_points. It is an + array consisting of tuples of + state limits. Each trip will be associated with one state + limit tuple when binding. A NULL pointer means + on all trips. + These limits are used when binding a cdev to a trip point. + .match: + This call back returns success(0) if the 'tz and cdev' need to + be bound, as per platform data. + + :: + + struct thermal_zone_params + + This structure defines the platform level parameters for a thermal zone. + This data, for each thermal zone should come from the platform layer. + This is an optional feature where some platforms can choose not to + provide this data. + + .governor_name: + Name of the thermal governor used for this zone + .no_hwmon: + a boolean to indicate if the thermal to hwmon sysfs interface + is required. when no_hwmon == false, a hwmon sysfs interface + will be created. when no_hwmon == true, nothing will be done. + In case the thermal_zone_params is NULL, the hwmon interface + will be created (for backward compatibility). + .num_tbps: + Number of thermal_bind_params entries for this zone + .tbp: + thermal_bind_params entries + +2. sysfs attributes structure +============================= + +== ================ +RO read only value +WO write only value +RW read/write value +== ================ + +Thermal sysfs attributes will be represented under /sys/class/thermal. +Hwmon sysfs I/F extension is also available under /sys/class/hwmon +if hwmon is compiled in or built as a module. + +Thermal zone device sys I/F, created once it's registered:: + + /sys/class/thermal/thermal_zone[0-*]: + |---type: Type of the thermal zone + |---temp: Current temperature + |---mode: Working mode of the thermal zone + |---policy: Thermal governor used for this zone + |---available_policies: Available thermal governors for this zone + |---trip_point_[0-*]_temp: Trip point temperature + |---trip_point_[0-*]_type: Trip point type + |---trip_point_[0-*]_hyst: Hysteresis value for this trip point + |---emul_temp: Emulated temperature set node + |---sustainable_power: Sustainable dissipatable power + |---k_po: Proportional term during temperature overshoot + |---k_pu: Proportional term during temperature undershoot + |---k_i: PID's integral term in the power allocator gov + |---k_d: PID's derivative term in the power allocator + |---integral_cutoff: Offset above which errors are accumulated + |---slope: Slope constant applied as linear extrapolation + |---offset: Offset constant applied as linear extrapolation + +Thermal cooling device sys I/F, created once it's registered:: + + /sys/class/thermal/cooling_device[0-*]: + |---type: Type of the cooling device(processor/fan/...) + |---max_state: Maximum cooling state of the cooling device + |---cur_state: Current cooling state of the cooling device + |---stats: Directory containing cooling device's statistics + |---stats/reset: Writing any value resets the statistics + |---stats/time_in_state_ms: Time (msec) spent in various cooling states + |---stats/total_trans: Total number of times cooling state is changed + |---stats/trans_table: Cooing state transition table + + +Then next two dynamic attributes are created/removed in pairs. They represent +the relationship between a thermal zone and its associated cooling device. +They are created/removed for each successful execution of +thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device. + +:: + + /sys/class/thermal/thermal_zone[0-*]: + |---cdev[0-*]: [0-*]th cooling device in current thermal zone + |---cdev[0-*]_trip_point: Trip point that cdev[0-*] is associated with + |---cdev[0-*]_weight: Influence of the cooling device in + this thermal zone + +Besides the thermal zone device sysfs I/F and cooling device sysfs I/F, +the generic thermal driver also creates a hwmon sysfs I/F for each _type_ +of thermal zone device. E.g. the generic thermal driver registers one hwmon +class device and build the associated hwmon sysfs I/F for all the registered +ACPI thermal zones. + +:: + + /sys/class/hwmon/hwmon[0-*]: + |---name: The type of the thermal zone devices + |---temp[1-*]_input: The current temperature of thermal zone [1-*] + |---temp[1-*]_critical: The critical trip point of thermal zone [1-*] + +Please read Documentation/hwmon/sysfs-interface.rst for additional information. + +Thermal zone attributes +----------------------- + +type + Strings which represent the thermal zone type. + This is given by thermal zone driver as part of registration. + E.g: "acpitz" indicates it's an ACPI thermal device. + In order to keep it consistent with hwmon sys attribute; this should + be a short, lowercase string, not containing spaces nor dashes. + RO, Required + +temp + Current temperature as reported by thermal zone (sensor). + Unit: millidegree Celsius + RO, Required + +mode + One of the predefined values in [enabled, disabled]. + This file gives information about the algorithm that is currently + managing the thermal zone. It can be either default kernel based + algorithm or user space application. + + enabled + enable Kernel Thermal management. + disabled + Preventing kernel thermal zone driver actions upon + trip points so that user application can take full + charge of the thermal management. + + RW, Optional + +policy + One of the various thermal governors used for a particular zone. + + RW, Required + +available_policies + Available thermal governors which can be used for a particular zone. + + RO, Required + +`trip_point_[0-*]_temp` + The temperature above which trip point will be fired. + + Unit: millidegree Celsius + + RO, Optional + +`trip_point_[0-*]_type` + Strings which indicate the type of the trip point. + + E.g. it can be one of critical, hot, passive, `active[0-*]` for ACPI + thermal zone. + + RO, Optional + +`trip_point_[0-*]_hyst` + The hysteresis value for a trip point, represented as an integer + Unit: Celsius + RW, Optional + +`cdev[0-*]` + Sysfs link to the thermal cooling device node where the sys I/F + for cooling device throttling control represents. + + RO, Optional + +`cdev[0-*]_trip_point` + The trip point in this thermal zone which `cdev[0-*]` is associated + with; -1 means the cooling device is not associated with any trip + point. + + RO, Optional + +`cdev[0-*]_weight` + The influence of `cdev[0-*]` in this thermal zone. This value + is relative to the rest of cooling devices in the thermal + zone. For example, if a cooling device has a weight double + than that of other, it's twice as effective in cooling the + thermal zone. + + RW, Optional + +passive + Attribute is only present for zones in which the passive cooling + policy is not supported by native thermal driver. Default is zero + and can be set to a temperature (in millidegrees) to enable a + passive trip point for the zone. Activation is done by polling with + an interval of 1 second. + + Unit: millidegrees Celsius + + Valid values: 0 (disabled) or greater than 1000 + + RW, Optional + +emul_temp + Interface to set the emulated temperature method in thermal zone + (sensor). After setting this temperature, the thermal zone may pass + this temperature to platform emulation function if registered or + cache it locally. This is useful in debugging different temperature + threshold and its associated cooling action. This is write only node + and writing 0 on this node should disable emulation. + Unit: millidegree Celsius + + WO, Optional + + WARNING: + Be careful while enabling this option on production systems, + because userland can easily disable the thermal policy by simply + flooding this sysfs node with low temperature values. + +sustainable_power + An estimate of the sustained power that can be dissipated by + the thermal zone. Used by the power allocator governor. For + more information see Documentation/driver-api/thermal/power_allocator.rst + + Unit: milliwatts + + RW, Optional + +k_po + The proportional term of the power allocator governor's PID + controller during temperature overshoot. Temperature overshoot + is when the current temperature is above the "desired + temperature" trip point. For more information see + Documentation/driver-api/thermal/power_allocator.rst + + RW, Optional + +k_pu + The proportional term of the power allocator governor's PID + controller during temperature undershoot. Temperature undershoot + is when the current temperature is below the "desired + temperature" trip point. For more information see + Documentation/driver-api/thermal/power_allocator.rst + + RW, Optional + +k_i + The integral term of the power allocator governor's PID + controller. This term allows the PID controller to compensate + for long term drift. For more information see + Documentation/driver-api/thermal/power_allocator.rst + + RW, Optional + +k_d + The derivative term of the power allocator governor's PID + controller. For more information see + Documentation/driver-api/thermal/power_allocator.rst + + RW, Optional + +integral_cutoff + Temperature offset from the desired temperature trip point + above which the integral term of the power allocator + governor's PID controller starts accumulating errors. For + example, if integral_cutoff is 0, then the integral term only + accumulates error when temperature is above the desired + temperature trip point. For more information see + Documentation/driver-api/thermal/power_allocator.rst + + Unit: millidegree Celsius + + RW, Optional + +slope + The slope constant used in a linear extrapolation model + to determine a hotspot temperature based off the sensor's + raw readings. It is up to the device driver to determine + the usage of these values. + + RW, Optional + +offset + The offset constant used in a linear extrapolation model + to determine a hotspot temperature based off the sensor's + raw readings. It is up to the device driver to determine + the usage of these values. + + RW, Optional + +Cooling device attributes +------------------------- + +type + String which represents the type of device, e.g: + + - for generic ACPI: should be "Fan", "Processor" or "LCD" + - for memory controller device on intel_menlow platform: + should be "Memory controller". + + RO, Required + +max_state + The maximum permissible cooling state of this cooling device. + + RO, Required + +cur_state + The current cooling state of this cooling device. + The value can any integer numbers between 0 and max_state: + + - cur_state == 0 means no cooling + - cur_state == max_state means the maximum cooling. + + RW, Required + +stats/reset + Writing any value resets the cooling device's statistics. + WO, Required + +stats/time_in_state_ms: + The amount of time spent by the cooling device in various cooling + states. The output will have "